diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,44719 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 1655, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1274.0, + "completions/max_terminated_length": 1274.0, + "completions/mean_length": 553.953125, + "completions/mean_terminated_length": 553.953125, + "completions/min_length": 287.0, + "completions/min_terminated_length": 287.0, + "epoch": 0.0006042296072507553, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.050295617431402206, + "learning_rate": 0.0, + "loss": -0.0159, + "num_tokens": 205773.0, + "reward": 3.153773546218872, + "reward_std": 1.5470020771026611, + "rewards/accuracy_reward/mean": 2.403773546218872, + "rewards/accuracy_reward/std": 3.425541877746582, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 919.0, + "completions/max_terminated_length": 919.0, + "completions/mean_length": 509.5, + "completions/mean_terminated_length": 509.5, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "epoch": 0.0012084592145015106, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03184757009148598, + "learning_rate": 3.6144578313253015e-08, + "loss": 0.0139, + "num_tokens": 398813.0, + "reward": 5.406036376953125, + "reward_std": 1.0645673274993896, + "rewards/accuracy_reward/mean": 4.656036376953125, + "rewards/accuracy_reward/std": 3.5390419960021973, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 2 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 867.0, + "completions/max_terminated_length": 867.0, + "completions/mean_length": 527.671875, + "completions/mean_terminated_length": 527.671875, + "completions/min_length": 274.0, + "completions/min_terminated_length": 274.0, + "epoch": 0.0018126888217522659, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03984128683805466, + "learning_rate": 7.228915662650603e-08, + "loss": -0.0044, + "num_tokens": 558776.0, + "reward": 6.898014545440674, + "reward_std": 1.7522356510162354, + "rewards/accuracy_reward/mean": 6.148015022277832, + "rewards/accuracy_reward/std": 2.6706326007843018, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 3 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1125.0, + "completions/max_terminated_length": 1125.0, + "completions/mean_length": 612.859375, + "completions/mean_terminated_length": 612.859375, + "completions/min_length": 302.0, + "completions/min_terminated_length": 302.0, + "epoch": 0.002416918429003021, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04814017936587334, + "learning_rate": 1.0843373493975904e-07, + "loss": 0.023, + "num_tokens": 698303.0, + "reward": 7.055063724517822, + "reward_std": 2.2661855220794678, + "rewards/accuracy_reward/mean": 6.305063724517822, + "rewards/accuracy_reward/std": 2.668713331222534, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 4 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 888.0, + "completions/max_terminated_length": 888.0, + "completions/mean_length": 583.359375, + "completions/mean_terminated_length": 583.359375, + "completions/min_length": 399.0, + "completions/min_terminated_length": 399.0, + "epoch": 0.0030211480362537764, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.038950156420469284, + "learning_rate": 1.4457831325301206e-07, + "loss": -0.0099, + "num_tokens": 889606.0, + "reward": 6.2162580490112305, + "reward_std": 1.6007421016693115, + "rewards/accuracy_reward/mean": 5.466257572174072, + "rewards/accuracy_reward/std": 3.2965898513793945, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 5 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1459.0, + "completions/max_terminated_length": 1459.0, + "completions/mean_length": 655.625, + "completions/mean_terminated_length": 655.625, + "completions/min_length": 377.0, + "completions/min_terminated_length": 377.0, + "epoch": 0.0036253776435045317, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.028167234733700752, + "learning_rate": 1.8072289156626505e-07, + "loss": 0.0023, + "num_tokens": 1055758.0, + "reward": 4.166179656982422, + "reward_std": 1.6855958700180054, + "rewards/accuracy_reward/mean": 3.41618013381958, + "rewards/accuracy_reward/std": 3.766094446182251, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 6 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 869.0, + "completions/max_terminated_length": 869.0, + "completions/mean_length": 610.953125, + "completions/mean_terminated_length": 610.953125, + "completions/min_length": 419.0, + "completions/min_terminated_length": 419.0, + "epoch": 0.004229607250755287, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04250720515847206, + "learning_rate": 2.1686746987951808e-07, + "loss": 0.0047, + "num_tokens": 1198331.0, + "reward": 4.911348819732666, + "reward_std": 1.8924040794372559, + "rewards/accuracy_reward/mean": 4.161348819732666, + "rewards/accuracy_reward/std": 3.6122848987579346, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 7 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1846.0, + "completions/max_terminated_length": 1846.0, + "completions/mean_length": 701.0625, + "completions/mean_terminated_length": 701.0625, + "completions/min_length": 361.0, + "completions/min_terminated_length": 361.0, + "epoch": 0.004833836858006042, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06282279640436172, + "learning_rate": 2.5301204819277107e-07, + "loss": 0.0996, + "num_tokens": 1347679.0, + "reward": 5.200138092041016, + "reward_std": 1.7110226154327393, + "rewards/accuracy_reward/mean": 4.461856842041016, + "rewards/accuracy_reward/std": 3.60711669921875, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 8 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1003.0, + "completions/mean_length": 620.53125, + "completions/mean_terminated_length": 597.873046875, + "completions/min_length": 315.0, + "completions/min_terminated_length": 315.0, + "epoch": 0.005438066465256798, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03610637038946152, + "learning_rate": 2.891566265060241e-07, + "loss": -0.0184, + "num_tokens": 1529185.0, + "reward": 2.556605577468872, + "reward_std": 1.924451470375061, + "rewards/accuracy_reward/mean": 1.8183242082595825, + "rewards/accuracy_reward/std": 3.274702787399292, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 9 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 743.0, + "completions/max_terminated_length": 743.0, + "completions/mean_length": 571.359375, + "completions/mean_terminated_length": 571.359375, + "completions/min_length": 386.0, + "completions/min_terminated_length": 386.0, + "epoch": 0.006042296072507553, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.03285863995552063, + "learning_rate": 3.253012048192771e-07, + "loss": 0.0145, + "num_tokens": 1667944.0, + "reward": 2.9261159896850586, + "reward_std": 1.3922388553619385, + "rewards/accuracy_reward/mean": 2.1761159896850586, + "rewards/accuracy_reward/std": 3.6969473361968994, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 10 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1072.0, + "completions/max_terminated_length": 1072.0, + "completions/mean_length": 636.65625, + "completions/mean_terminated_length": 636.65625, + "completions/min_length": 398.0, + "completions/min_terminated_length": 398.0, + "epoch": 0.006646525679758308, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.03892485424876213, + "learning_rate": 3.614457831325301e-07, + "loss": -0.0812, + "num_tokens": 1834306.0, + "reward": 3.111978054046631, + "reward_std": 1.4843695163726807, + "rewards/accuracy_reward/mean": 2.365884304046631, + "rewards/accuracy_reward/std": 3.519395351409912, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 11 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1854.0, + "completions/mean_length": 799.859375, + "completions/mean_terminated_length": 780.0476684570312, + "completions/min_length": 343.0, + "completions/min_terminated_length": 343.0, + "epoch": 0.0072507552870090634, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.005584922153502703, + "learning_rate": 3.9759036144578316e-07, + "loss": -0.0197, + "num_tokens": 2075545.0, + "reward": 0.7134984731674194, + "reward_std": 0.2577509880065918, + "rewards/accuracy_reward/mean": -0.02478281408548355, + "rewards/accuracy_reward/std": 0.3544968366622925, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 12 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 822.0, + "completions/max_terminated_length": 822.0, + "completions/mean_length": 557.25, + "completions/mean_terminated_length": 557.25, + "completions/min_length": 382.0, + "completions/min_terminated_length": 382.0, + "epoch": 0.00785498489425982, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.014693841338157654, + "learning_rate": 4.3373493975903615e-07, + "loss": 0.0008, + "num_tokens": 2271689.0, + "reward": 2.458230972290039, + "reward_std": 0.5889573097229004, + "rewards/accuracy_reward/mean": 1.7082310914993286, + "rewards/accuracy_reward/std": 3.191032886505127, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 13 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 904.0, + "completions/max_terminated_length": 904.0, + "completions/mean_length": 520.96875, + "completions/mean_terminated_length": 520.96875, + "completions/min_length": 342.0, + "completions/min_terminated_length": 342.0, + "epoch": 0.008459214501510574, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0028674558270722628, + "learning_rate": 4.698795180722892e-07, + "loss": 0.0002, + "num_tokens": 2456119.0, + "reward": 2.5812735557556152, + "reward_std": 0.0900636613368988, + "rewards/accuracy_reward/mean": 1.8312735557556152, + "rewards/accuracy_reward/std": 3.2361526489257812, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 14 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 943.0, + "completions/max_terminated_length": 943.0, + "completions/mean_length": 497.5625, + "completions/mean_terminated_length": 497.5625, + "completions/min_length": 353.0, + "completions/min_terminated_length": 353.0, + "epoch": 0.00906344410876133, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03219522535800934, + "learning_rate": 5.060240963855421e-07, + "loss": 0.0233, + "num_tokens": 2585419.0, + "reward": 7.630303382873535, + "reward_std": 0.924082338809967, + "rewards/accuracy_reward/mean": 6.880303382873535, + "rewards/accuracy_reward/std": 1.8996378183364868, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 15 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 892.0, + "completions/max_terminated_length": 892.0, + "completions/mean_length": 497.25, + "completions/mean_terminated_length": 497.25, + "completions/min_length": 324.0, + "completions/min_terminated_length": 324.0, + "epoch": 0.009667673716012085, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.029251661151647568, + "learning_rate": 5.421686746987952e-07, + "loss": 0.0081, + "num_tokens": 2759755.0, + "reward": 5.789407730102539, + "reward_std": 1.293677806854248, + "rewards/accuracy_reward/mean": 5.043314456939697, + "rewards/accuracy_reward/std": 3.4291224479675293, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 16 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 794.0, + "completions/max_terminated_length": 794.0, + "completions/mean_length": 565.359375, + "completions/mean_terminated_length": 565.359375, + "completions/min_length": 408.0, + "completions/min_terminated_length": 408.0, + "epoch": 0.01027190332326284, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.011992952786386013, + "learning_rate": 5.783132530120482e-07, + "loss": -0.0026, + "num_tokens": 2924866.0, + "reward": 6.20686149597168, + "reward_std": 0.4806945323944092, + "rewards/accuracy_reward/mean": 5.45686149597168, + "rewards/accuracy_reward/std": 3.3082451820373535, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 17 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 829.0, + "completions/max_terminated_length": 829.0, + "completions/mean_length": 522.359375, + "completions/mean_terminated_length": 522.359375, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "epoch": 0.010876132930513595, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04265030845999718, + "learning_rate": 6.144578313253012e-07, + "loss": -0.0181, + "num_tokens": 3081769.0, + "reward": 5.922054290771484, + "reward_std": 2.3661410808563232, + "rewards/accuracy_reward/mean": 5.172054290771484, + "rewards/accuracy_reward/std": 3.3974478244781494, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 18 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 924.0, + "completions/max_terminated_length": 924.0, + "completions/mean_length": 487.921875, + "completions/mean_terminated_length": 487.921875, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 0.011480362537764351, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03564700856804848, + "learning_rate": 6.506024096385542e-07, + "loss": -0.0097, + "num_tokens": 3282612.0, + "reward": 4.637045860290527, + "reward_std": 1.5894155502319336, + "rewards/accuracy_reward/mean": 3.8870458602905273, + "rewards/accuracy_reward/std": 3.6724298000335693, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 19 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 689.0, + "completions/mean_length": 528.5, + "completions/mean_terminated_length": 504.3809814453125, + "completions/min_length": 337.0, + "completions/min_terminated_length": 337.0, + "epoch": 0.012084592145015106, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.031159086152911186, + "learning_rate": 6.867469879518072e-07, + "loss": 0.0012, + "num_tokens": 3468452.0, + "reward": 3.7171807289123535, + "reward_std": 1.0332908630371094, + "rewards/accuracy_reward/mean": 2.9788994789123535, + "rewards/accuracy_reward/std": 3.6646692752838135, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 20 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1523.0, + "completions/max_terminated_length": 1523.0, + "completions/mean_length": 796.921875, + "completions/mean_terminated_length": 796.921875, + "completions/min_length": 436.0, + "completions/min_terminated_length": 436.0, + "epoch": 0.012688821752265862, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05375707522034645, + "learning_rate": 7.228915662650602e-07, + "loss": 0.0002, + "num_tokens": 3657903.0, + "reward": 3.628514051437378, + "reward_std": 3.31272554397583, + "rewards/accuracy_reward/mean": 2.878514289855957, + "rewards/accuracy_reward/std": 3.5807807445526123, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 21 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1236.0, + "completions/max_terminated_length": 1236.0, + "completions/mean_length": 709.0625, + "completions/mean_terminated_length": 709.0625, + "completions/min_length": 382.0, + "completions/min_terminated_length": 382.0, + "epoch": 0.013293051359516616, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.024670317769050598, + "learning_rate": 7.590361445783132e-07, + "loss": -0.0033, + "num_tokens": 3848851.0, + "reward": 2.230384349822998, + "reward_std": 0.5281810164451599, + "rewards/accuracy_reward/mean": 1.480384349822998, + "rewards/accuracy_reward/std": 3.3552050590515137, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 22 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 840.0, + "completions/max_terminated_length": 840.0, + "completions/mean_length": 571.65625, + "completions/mean_terminated_length": 571.65625, + "completions/min_length": 389.0, + "completions/min_terminated_length": 389.0, + "epoch": 0.013897280966767372, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03648856282234192, + "learning_rate": 7.951807228915663e-07, + "loss": 0.0119, + "num_tokens": 4014061.0, + "reward": 6.882045269012451, + "reward_std": 1.7237820625305176, + "rewards/accuracy_reward/mean": 6.132045269012451, + "rewards/accuracy_reward/std": 2.8418400287628174, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 23 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1166.0, + "completions/max_terminated_length": 1166.0, + "completions/mean_length": 662.078125, + "completions/mean_terminated_length": 662.078125, + "completions/min_length": 374.0, + "completions/min_terminated_length": 374.0, + "epoch": 0.014501510574018127, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.029896289110183716, + "learning_rate": 8.313253012048193e-07, + "loss": 0.0064, + "num_tokens": 4212338.0, + "reward": 3.9328091144561768, + "reward_std": 1.3493698835372925, + "rewards/accuracy_reward/mean": 3.182809352874756, + "rewards/accuracy_reward/std": 3.7394089698791504, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 24 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1321.0, + "completions/max_terminated_length": 1321.0, + "completions/mean_length": 619.890625, + "completions/mean_terminated_length": 619.890625, + "completions/min_length": 457.0, + "completions/min_terminated_length": 457.0, + "epoch": 0.015105740181268883, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03240245208144188, + "learning_rate": 8.674698795180723e-07, + "loss": 0.024, + "num_tokens": 4486971.0, + "reward": 3.7298223972320557, + "reward_std": 1.0474045276641846, + "rewards/accuracy_reward/mean": 2.9798223972320557, + "rewards/accuracy_reward/std": 3.7330572605133057, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 25 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 849.0, + "completions/mean_length": 602.265625, + "completions/mean_terminated_length": 579.3175048828125, + "completions/min_length": 397.0, + "completions/min_terminated_length": 397.0, + "epoch": 0.01570996978851964, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05332217365503311, + "learning_rate": 9.036144578313254e-07, + "loss": -0.0224, + "num_tokens": 4626396.0, + "reward": 6.073330402374268, + "reward_std": 3.480088233947754, + "rewards/accuracy_reward/mean": 5.335049152374268, + "rewards/accuracy_reward/std": 3.3871586322784424, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 26 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1370.0, + "completions/mean_length": 689.96875, + "completions/mean_terminated_length": 668.4127197265625, + "completions/min_length": 295.0, + "completions/min_terminated_length": 295.0, + "epoch": 0.016314199395770394, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.043454255908727646, + "learning_rate": 9.397590361445784e-07, + "loss": -0.0439, + "num_tokens": 4789930.0, + "reward": 4.447166442871094, + "reward_std": 2.5489754676818848, + "rewards/accuracy_reward/mean": 3.7127914428710938, + "rewards/accuracy_reward/std": 3.794985771179199, + "rewards/tag_count_reward/mean": 0.734375, + "rewards/tag_count_reward/std": 0.09834947437047958, + "step": 27 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1627.0, + "completions/max_terminated_length": 1627.0, + "completions/mean_length": 555.296875, + "completions/mean_terminated_length": 555.296875, + "completions/min_length": 310.0, + "completions/min_terminated_length": 310.0, + "epoch": 0.016918429003021148, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.01163510326296091, + "learning_rate": 9.759036144578313e-07, + "loss": -0.0012, + "num_tokens": 4977485.0, + "reward": 2.690523147583008, + "reward_std": 0.5082286596298218, + "rewards/accuracy_reward/mean": 1.9444295167922974, + "rewards/accuracy_reward/std": 3.2972168922424316, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 28 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1350.0, + "completions/mean_length": 697.328125, + "completions/mean_terminated_length": 675.888916015625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "epoch": 0.017522658610271902, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.057535089552402496, + "learning_rate": 1.0120481927710843e-06, + "loss": -0.0106, + "num_tokens": 5121058.0, + "reward": 3.759157657623291, + "reward_std": 2.921753168106079, + "rewards/accuracy_reward/mean": 3.020876407623291, + "rewards/accuracy_reward/std": 3.799668312072754, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 29 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1393.0, + "completions/mean_length": 719.734375, + "completions/mean_terminated_length": 698.6508178710938, + "completions/min_length": 369.0, + "completions/min_terminated_length": 369.0, + "epoch": 0.01812688821752266, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04282220080494881, + "learning_rate": 1.0481927710843375e-06, + "loss": -0.0302, + "num_tokens": 5280577.0, + "reward": 5.015376091003418, + "reward_std": 2.398833751678467, + "rewards/accuracy_reward/mean": 4.27709436416626, + "rewards/accuracy_reward/std": 3.735231876373291, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 30 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1333.0, + "completions/max_terminated_length": 1333.0, + "completions/mean_length": 595.359375, + "completions/mean_terminated_length": 595.359375, + "completions/min_length": 283.0, + "completions/min_terminated_length": 283.0, + "epoch": 0.018731117824773415, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05114954710006714, + "learning_rate": 1.0843373493975905e-06, + "loss": -0.011, + "num_tokens": 5439240.0, + "reward": 3.3853297233581543, + "reward_std": 2.6987695693969727, + "rewards/accuracy_reward/mean": 2.6353297233581543, + "rewards/accuracy_reward/std": 3.6162471771240234, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 31 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1962.0, + "completions/mean_length": 833.21875, + "completions/mean_terminated_length": 794.0322265625, + "completions/min_length": 448.0, + "completions/min_terminated_length": 448.0, + "epoch": 0.01933534743202417, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05964215472340584, + "learning_rate": 1.1204819277108433e-06, + "loss": -0.048, + "num_tokens": 5679222.0, + "reward": 3.6159236431121826, + "reward_std": 2.9975881576538086, + "rewards/accuracy_reward/mean": 2.8893609046936035, + "rewards/accuracy_reward/std": 3.653923988342285, + "rewards/tag_count_reward/mean": 0.7265625, + "rewards/tag_count_reward/std": 0.13152606785297394, + "step": 32 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 966.0, + "completions/max_terminated_length": 966.0, + "completions/mean_length": 607.78125, + "completions/mean_terminated_length": 607.78125, + "completions/min_length": 274.0, + "completions/min_terminated_length": 274.0, + "epoch": 0.019939577039274924, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.039370499551296234, + "learning_rate": 1.1566265060240965e-06, + "loss": 0.0149, + "num_tokens": 5855688.0, + "reward": 3.7163641452789307, + "reward_std": 1.5974831581115723, + "rewards/accuracy_reward/mean": 2.9663639068603516, + "rewards/accuracy_reward/std": 3.7167491912841797, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 33 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 982.0, + "completions/max_terminated_length": 982.0, + "completions/mean_length": 605.828125, + "completions/mean_terminated_length": 605.828125, + "completions/min_length": 407.0, + "completions/min_terminated_length": 407.0, + "epoch": 0.02054380664652568, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.004608116112649441, + "learning_rate": 1.1927710843373495e-06, + "loss": 0.0013, + "num_tokens": 6010077.0, + "reward": 0.671875, + "reward_std": 0.17430339753627777, + "rewards/accuracy_reward/mean": -0.078125, + "rewards/accuracy_reward/std": 0.27048972249031067, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 34 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1068.0, + "completions/max_terminated_length": 1068.0, + "completions/mean_length": 631.03125, + "completions/mean_terminated_length": 631.03125, + "completions/min_length": 344.0, + "completions/min_terminated_length": 344.0, + "epoch": 0.021148036253776436, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.033892471343278885, + "learning_rate": 1.2289156626506025e-06, + "loss": -0.0201, + "num_tokens": 6169967.0, + "reward": 3.5952889919281006, + "reward_std": 1.0775911808013916, + "rewards/accuracy_reward/mean": 2.8452889919281006, + "rewards/accuracy_reward/std": 3.553039073944092, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 35 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 832.0, + "completions/max_terminated_length": 832.0, + "completions/mean_length": 560.65625, + "completions/mean_terminated_length": 560.65625, + "completions/min_length": 368.0, + "completions/min_terminated_length": 368.0, + "epoch": 0.02175226586102719, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.043978251516819, + "learning_rate": 1.2650602409638555e-06, + "loss": -0.0094, + "num_tokens": 6316505.0, + "reward": 2.8188905715942383, + "reward_std": 1.8558154106140137, + "rewards/accuracy_reward/mean": 2.0688905715942383, + "rewards/accuracy_reward/std": 3.3709847927093506, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 36 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 844.0, + "completions/max_terminated_length": 844.0, + "completions/mean_length": 620.765625, + "completions/mean_terminated_length": 620.765625, + "completions/min_length": 426.0, + "completions/min_terminated_length": 426.0, + "epoch": 0.022356495468277945, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04905589297413826, + "learning_rate": 1.3012048192771085e-06, + "loss": 0.0188, + "num_tokens": 6511994.0, + "reward": 6.556213855743408, + "reward_std": 2.4705023765563965, + "rewards/accuracy_reward/mean": 5.806214332580566, + "rewards/accuracy_reward/std": 3.150972604751587, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 37 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1615.0, + "completions/max_terminated_length": 1615.0, + "completions/mean_length": 795.140625, + "completions/mean_terminated_length": 795.140625, + "completions/min_length": 436.0, + "completions/min_terminated_length": 436.0, + "epoch": 0.022960725075528703, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.020618027076125145, + "learning_rate": 1.3373493975903615e-06, + "loss": -0.0098, + "num_tokens": 6748499.0, + "reward": 3.023449659347534, + "reward_std": 0.84998619556427, + "rewards/accuracy_reward/mean": 2.273449659347534, + "rewards/accuracy_reward/std": 3.375208616256714, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 38 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1522.0, + "completions/max_terminated_length": 1522.0, + "completions/mean_length": 712.203125, + "completions/mean_terminated_length": 712.203125, + "completions/min_length": 327.0, + "completions/min_terminated_length": 327.0, + "epoch": 0.023564954682779457, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0517582893371582, + "learning_rate": 1.3734939759036144e-06, + "loss": 0.0091, + "num_tokens": 7018320.0, + "reward": 4.219972610473633, + "reward_std": 2.7555487155914307, + "rewards/accuracy_reward/mean": 3.469972848892212, + "rewards/accuracy_reward/std": 3.7151989936828613, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 39 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 919.0, + "completions/mean_length": 597.59375, + "completions/mean_terminated_length": 574.5714721679688, + "completions/min_length": 364.0, + "completions/min_terminated_length": 364.0, + "epoch": 0.02416918429003021, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03575087711215019, + "learning_rate": 1.4096385542168674e-06, + "loss": -0.0218, + "num_tokens": 7187958.0, + "reward": 3.9755351543426514, + "reward_std": 1.8154646158218384, + "rewards/accuracy_reward/mean": 3.2372541427612305, + "rewards/accuracy_reward/std": 3.733604669570923, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 40 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1287.0, + "completions/mean_length": 669.3125, + "completions/mean_terminated_length": 647.4285888671875, + "completions/min_length": 398.0, + "completions/min_terminated_length": 398.0, + "epoch": 0.024773413897280966, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03482973203063011, + "learning_rate": 1.4457831325301204e-06, + "loss": -0.0025, + "num_tokens": 7374250.0, + "reward": 3.6707019805908203, + "reward_std": 1.6172971725463867, + "rewards/accuracy_reward/mean": 2.9324207305908203, + "rewards/accuracy_reward/std": 3.5779407024383545, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 41 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 870.0, + "completions/max_terminated_length": 870.0, + "completions/mean_length": 551.359375, + "completions/mean_terminated_length": 551.359375, + "completions/min_length": 341.0, + "completions/min_terminated_length": 341.0, + "epoch": 0.025377643504531724, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03713849186897278, + "learning_rate": 1.4819277108433734e-06, + "loss": -0.0005, + "num_tokens": 7655361.0, + "reward": 3.463892936706543, + "reward_std": 1.1130719184875488, + "rewards/accuracy_reward/mean": 2.717799186706543, + "rewards/accuracy_reward/std": 3.6059978008270264, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 42 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1990.0, + "completions/max_terminated_length": 1990.0, + "completions/mean_length": 720.53125, + "completions/mean_terminated_length": 720.53125, + "completions/min_length": 299.0, + "completions/min_terminated_length": 299.0, + "epoch": 0.025981873111782478, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03813225403428078, + "learning_rate": 1.5180722891566264e-06, + "loss": -0.0073, + "num_tokens": 7852019.0, + "reward": 2.5013504028320312, + "reward_std": 1.9672629833221436, + "rewards/accuracy_reward/mean": 1.7513505220413208, + "rewards/accuracy_reward/std": 3.0882129669189453, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 43 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 973.0, + "completions/max_terminated_length": 973.0, + "completions/mean_length": 528.875, + "completions/mean_terminated_length": 528.875, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "epoch": 0.026586102719033233, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.044228166341781616, + "learning_rate": 1.5542168674698796e-06, + "loss": 0.0058, + "num_tokens": 7965259.0, + "reward": 4.412381649017334, + "reward_std": 2.6449060440063477, + "rewards/accuracy_reward/mean": 3.666287899017334, + "rewards/accuracy_reward/std": 3.7189571857452393, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 44 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 971.0, + "completions/max_terminated_length": 971.0, + "completions/mean_length": 558.09375, + "completions/mean_terminated_length": 558.09375, + "completions/min_length": 359.0, + "completions/min_terminated_length": 359.0, + "epoch": 0.027190332326283987, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04005519300699234, + "learning_rate": 1.5903614457831326e-06, + "loss": 0.0469, + "num_tokens": 8111217.0, + "reward": 5.932765483856201, + "reward_std": 1.6713883876800537, + "rewards/accuracy_reward/mean": 5.182765483856201, + "rewards/accuracy_reward/std": 3.43211030960083, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 45 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 900.0, + "completions/max_terminated_length": 900.0, + "completions/mean_length": 630.328125, + "completions/mean_terminated_length": 630.328125, + "completions/min_length": 425.0, + "completions/min_terminated_length": 425.0, + "epoch": 0.027794561933534745, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05692308023571968, + "learning_rate": 1.6265060240963854e-06, + "loss": 0.0193, + "num_tokens": 8291638.0, + "reward": 4.417205810546875, + "reward_std": 3.2755367755889893, + "rewards/accuracy_reward/mean": 3.671112298965454, + "rewards/accuracy_reward/std": 3.6898179054260254, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 46 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 890.0, + "completions/max_terminated_length": 890.0, + "completions/mean_length": 590.671875, + "completions/mean_terminated_length": 590.671875, + "completions/min_length": 341.0, + "completions/min_terminated_length": 341.0, + "epoch": 0.0283987915407855, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.029036523774266243, + "learning_rate": 1.6626506024096386e-06, + "loss": -0.0048, + "num_tokens": 8427137.0, + "reward": 3.0628061294555664, + "reward_std": 0.8525978326797485, + "rewards/accuracy_reward/mean": 2.3128063678741455, + "rewards/accuracy_reward/std": 3.4938251972198486, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 47 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 755.0, + "completions/max_terminated_length": 755.0, + "completions/mean_length": 562.390625, + "completions/mean_terminated_length": 562.390625, + "completions/min_length": 331.0, + "completions/min_terminated_length": 331.0, + "epoch": 0.029003021148036254, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04557610675692558, + "learning_rate": 1.6987951807228918e-06, + "loss": -0.0134, + "num_tokens": 8593898.0, + "reward": 6.4026947021484375, + "reward_std": 2.981767416000366, + "rewards/accuracy_reward/mean": 5.6526947021484375, + "rewards/accuracy_reward/std": 3.231515884399414, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 48 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 946.0, + "completions/max_terminated_length": 946.0, + "completions/mean_length": 663.765625, + "completions/mean_terminated_length": 663.765625, + "completions/min_length": 467.0, + "completions/min_terminated_length": 467.0, + "epoch": 0.029607250755287008, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.02721475251019001, + "learning_rate": 1.7349397590361446e-06, + "loss": 0.0099, + "num_tokens": 8752107.0, + "reward": 4.366235733032227, + "reward_std": 1.200506329536438, + "rewards/accuracy_reward/mean": 3.6162359714508057, + "rewards/accuracy_reward/std": 3.7534236907958984, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 49 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1042.0, + "completions/max_terminated_length": 1042.0, + "completions/mean_length": 638.0, + "completions/mean_terminated_length": 638.0, + "completions/min_length": 345.0, + "completions/min_terminated_length": 345.0, + "epoch": 0.030211480362537766, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03480658680200577, + "learning_rate": 1.7710843373493976e-06, + "loss": 0.0204, + "num_tokens": 8931515.0, + "reward": 4.112286567687988, + "reward_std": 1.931910514831543, + "rewards/accuracy_reward/mean": 3.3622865676879883, + "rewards/accuracy_reward/std": 3.723261594772339, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 50 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 839.0, + "completions/max_terminated_length": 839.0, + "completions/mean_length": 570.609375, + "completions/mean_terminated_length": 570.609375, + "completions/min_length": 282.0, + "completions/min_terminated_length": 282.0, + "epoch": 0.03081570996978852, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.01764088124036789, + "learning_rate": 1.8072289156626508e-06, + "loss": -0.0011, + "num_tokens": 9096786.0, + "reward": 4.253335475921631, + "reward_std": 0.5297549962997437, + "rewards/accuracy_reward/mean": 3.507241725921631, + "rewards/accuracy_reward/std": 3.68384051322937, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 51 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1000.0, + "completions/max_terminated_length": 1000.0, + "completions/mean_length": 641.015625, + "completions/mean_terminated_length": 641.015625, + "completions/min_length": 272.0, + "completions/min_terminated_length": 272.0, + "epoch": 0.03141993957703928, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03309987112879753, + "learning_rate": 1.8433734939759036e-06, + "loss": 0.0151, + "num_tokens": 9279827.0, + "reward": 4.3037028312683105, + "reward_std": 1.773489236831665, + "rewards/accuracy_reward/mean": 3.5537028312683105, + "rewards/accuracy_reward/std": 3.6202638149261475, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 52 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 969.0, + "completions/max_terminated_length": 969.0, + "completions/mean_length": 563.046875, + "completions/mean_terminated_length": 563.046875, + "completions/min_length": 283.0, + "completions/min_terminated_length": 283.0, + "epoch": 0.03202416918429003, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03278219699859619, + "learning_rate": 1.8795180722891568e-06, + "loss": 0.0042, + "num_tokens": 9422022.0, + "reward": 5.699906826019287, + "reward_std": 1.460031270980835, + "rewards/accuracy_reward/mean": 4.949906826019287, + "rewards/accuracy_reward/std": 3.375742197036743, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 53 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 731.0, + "completions/max_terminated_length": 731.0, + "completions/mean_length": 540.671875, + "completions/mean_terminated_length": 540.671875, + "completions/min_length": 330.0, + "completions/min_terminated_length": 330.0, + "epoch": 0.03262839879154079, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.029308876022696495, + "learning_rate": 1.9156626506024094e-06, + "loss": 0.0129, + "num_tokens": 9568737.0, + "reward": 4.868650913238525, + "reward_std": 0.873051643371582, + "rewards/accuracy_reward/mean": 4.118650436401367, + "rewards/accuracy_reward/std": 3.6966919898986816, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 54 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 981.0, + "completions/max_terminated_length": 981.0, + "completions/mean_length": 564.640625, + "completions/mean_terminated_length": 564.640625, + "completions/min_length": 296.0, + "completions/min_terminated_length": 296.0, + "epoch": 0.03323262839879154, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03402522951364517, + "learning_rate": 1.9518072289156626e-06, + "loss": 0.0022, + "num_tokens": 9753258.0, + "reward": 5.687193870544434, + "reward_std": 1.4355798959732056, + "rewards/accuracy_reward/mean": 4.937193870544434, + "rewards/accuracy_reward/std": 3.4618282318115234, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 55 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1040.0, + "completions/max_terminated_length": 1040.0, + "completions/mean_length": 683.90625, + "completions/mean_terminated_length": 683.90625, + "completions/min_length": 397.0, + "completions/min_terminated_length": 397.0, + "epoch": 0.033836858006042296, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.003007667837664485, + "learning_rate": 1.987951807228916e-06, + "loss": -0.0007, + "num_tokens": 9916596.0, + "reward": 4.525036334991455, + "reward_std": 0.1064966544508934, + "rewards/accuracy_reward/mean": 3.775036096572876, + "rewards/accuracy_reward/std": 3.601924419403076, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 56 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 781.0, + "completions/mean_length": 542.609375, + "completions/mean_terminated_length": 518.7142944335938, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "epoch": 0.03444108761329305, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.021038122475147247, + "learning_rate": 2.0240963855421686e-06, + "loss": -0.0037, + "num_tokens": 10064763.0, + "reward": 4.0408782958984375, + "reward_std": 0.9518296718597412, + "rewards/accuracy_reward/mean": 3.3025975227355957, + "rewards/accuracy_reward/std": 3.7252261638641357, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 57 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 817.0, + "completions/max_terminated_length": 817.0, + "completions/mean_length": 530.78125, + "completions/mean_terminated_length": 530.78125, + "completions/min_length": 276.0, + "completions/min_terminated_length": 276.0, + "epoch": 0.035045317220543805, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.049080900847911835, + "learning_rate": 2.0602409638554218e-06, + "loss": 0.01, + "num_tokens": 10287101.0, + "reward": 4.322224140167236, + "reward_std": 3.189997911453247, + "rewards/accuracy_reward/mean": 3.5722241401672363, + "rewards/accuracy_reward/std": 3.7489006519317627, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 58 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1536.0, + "completions/mean_length": 635.59375, + "completions/mean_terminated_length": 635.59375, + "completions/min_length": 300.0, + "completions/min_terminated_length": 300.0, + "epoch": 0.03564954682779456, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04173879697918892, + "learning_rate": 2.096385542168675e-06, + "loss": 0.0141, + "num_tokens": 10506659.0, + "reward": 3.2364718914031982, + "reward_std": 2.1587438583374023, + "rewards/accuracy_reward/mean": 2.4864718914031982, + "rewards/accuracy_reward/std": 3.5810294151306152, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 59 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1287.0, + "completions/max_terminated_length": 1287.0, + "completions/mean_length": 603.875, + "completions/mean_terminated_length": 603.875, + "completions/min_length": 356.0, + "completions/min_terminated_length": 356.0, + "epoch": 0.03625377643504532, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.017444007098674774, + "learning_rate": 2.1325301204819278e-06, + "loss": 0.0094, + "num_tokens": 10690843.0, + "reward": 4.363163948059082, + "reward_std": 0.498243510723114, + "rewards/accuracy_reward/mean": 3.613164186477661, + "rewards/accuracy_reward/std": 3.7447102069854736, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 60 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1696.0, + "completions/max_terminated_length": 1696.0, + "completions/mean_length": 691.296875, + "completions/mean_terminated_length": 691.296875, + "completions/min_length": 410.0, + "completions/min_terminated_length": 410.0, + "epoch": 0.036858006042296075, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.028560712933540344, + "learning_rate": 2.168674698795181e-06, + "loss": 0.0029, + "num_tokens": 10850350.0, + "reward": 2.7795982360839844, + "reward_std": 1.143225908279419, + "rewards/accuracy_reward/mean": 2.0295984745025635, + "rewards/accuracy_reward/std": 3.2984557151794434, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 61 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1248.0, + "completions/max_terminated_length": 1248.0, + "completions/mean_length": 716.71875, + "completions/mean_terminated_length": 716.71875, + "completions/min_length": 369.0, + "completions/min_terminated_length": 369.0, + "epoch": 0.03746223564954683, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.029387684538960457, + "learning_rate": 2.2048192771084338e-06, + "loss": 0.0182, + "num_tokens": 11132860.0, + "reward": 4.184628963470459, + "reward_std": 0.8651109933853149, + "rewards/accuracy_reward/mean": 3.43853497505188, + "rewards/accuracy_reward/std": 3.8097119331359863, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 62 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1174.0, + "completions/max_terminated_length": 1174.0, + "completions/mean_length": 571.328125, + "completions/mean_terminated_length": 571.328125, + "completions/min_length": 365.0, + "completions/min_terminated_length": 365.0, + "epoch": 0.038066465256797584, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03238219395279884, + "learning_rate": 2.2409638554216865e-06, + "loss": 0.008, + "num_tokens": 11284321.0, + "reward": 5.68924617767334, + "reward_std": 1.4605605602264404, + "rewards/accuracy_reward/mean": 4.93924617767334, + "rewards/accuracy_reward/std": 3.515507698059082, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 63 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1162.0, + "completions/max_terminated_length": 1162.0, + "completions/mean_length": 612.890625, + "completions/mean_terminated_length": 612.890625, + "completions/min_length": 355.0, + "completions/min_terminated_length": 355.0, + "epoch": 0.03867069486404834, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05453099310398102, + "learning_rate": 2.2771084337349398e-06, + "loss": -0.0505, + "num_tokens": 11474650.0, + "reward": 4.535656929016113, + "reward_std": 3.223883867263794, + "rewards/accuracy_reward/mean": 3.7856569290161133, + "rewards/accuracy_reward/std": 3.7283172607421875, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 64 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1116.0, + "completions/mean_length": 493.15625, + "completions/mean_terminated_length": 468.4762268066406, + "completions/min_length": 294.0, + "completions/min_terminated_length": 294.0, + "epoch": 0.03927492447129909, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04980238899588585, + "learning_rate": 2.313253012048193e-06, + "loss": 0.0215, + "num_tokens": 11593300.0, + "reward": 3.577423572540283, + "reward_std": 2.5298619270324707, + "rewards/accuracy_reward/mean": 2.839142322540283, + "rewards/accuracy_reward/std": 3.643772602081299, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 65 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 968.0, + "completions/max_terminated_length": 968.0, + "completions/mean_length": 602.078125, + "completions/mean_terminated_length": 602.078125, + "completions/min_length": 290.0, + "completions/min_terminated_length": 290.0, + "epoch": 0.03987915407854985, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03463369607925415, + "learning_rate": 2.3493975903614457e-06, + "loss": 0.0027, + "num_tokens": 11745449.0, + "reward": 7.631005764007568, + "reward_std": 1.6636110544204712, + "rewards/accuracy_reward/mean": 6.88100528717041, + "rewards/accuracy_reward/std": 1.9266676902770996, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 66 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1245.0, + "completions/mean_length": 671.265625, + "completions/mean_terminated_length": 626.8547973632812, + "completions/min_length": 362.0, + "completions/min_terminated_length": 362.0, + "epoch": 0.0404833836858006, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06322990357875824, + "learning_rate": 2.385542168674699e-06, + "loss": -0.0692, + "num_tokens": 11909210.0, + "reward": 2.8810508251190186, + "reward_std": 3.370999336242676, + "rewards/accuracy_reward/mean": 2.1544883251190186, + "rewards/accuracy_reward/std": 3.451857089996338, + "rewards/tag_count_reward/mean": 0.7265625, + "rewards/tag_count_reward/std": 0.13152606785297394, + "step": 67 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 788.0, + "completions/max_terminated_length": 788.0, + "completions/mean_length": 551.828125, + "completions/mean_terminated_length": 551.828125, + "completions/min_length": 360.0, + "completions/min_terminated_length": 360.0, + "epoch": 0.04108761329305136, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.026488380506634712, + "learning_rate": 2.421686746987952e-06, + "loss": 0.0138, + "num_tokens": 12154943.0, + "reward": 6.3821916580200195, + "reward_std": 1.101296305656433, + "rewards/accuracy_reward/mean": 5.6321916580200195, + "rewards/accuracy_reward/std": 3.1816396713256836, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 68 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1721.0, + "completions/mean_length": 708.421875, + "completions/mean_terminated_length": 687.1587524414062, + "completions/min_length": 381.0, + "completions/min_terminated_length": 381.0, + "epoch": 0.04169184290030212, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.044326942414045334, + "learning_rate": 2.457831325301205e-06, + "loss": -0.0349, + "num_tokens": 12374634.0, + "reward": 2.728135108947754, + "reward_std": 2.013517379760742, + "rewards/accuracy_reward/mean": 1.9859474897384644, + "rewards/accuracy_reward/std": 3.318373918533325, + "rewards/tag_count_reward/mean": 0.7421875, + "rewards/tag_count_reward/std": 0.043842025101184845, + "step": 69 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 788.0, + "completions/max_terminated_length": 788.0, + "completions/mean_length": 509.359375, + "completions/mean_terminated_length": 509.359375, + "completions/min_length": 351.0, + "completions/min_terminated_length": 351.0, + "epoch": 0.04229607250755287, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.045593831688165665, + "learning_rate": 2.4939759036144577e-06, + "loss": 0.0024, + "num_tokens": 12502209.0, + "reward": 5.030801296234131, + "reward_std": 2.8695926666259766, + "rewards/accuracy_reward/mean": 4.280800819396973, + "rewards/accuracy_reward/std": 3.618844985961914, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 70 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 797.0, + "completions/max_terminated_length": 797.0, + "completions/mean_length": 520.0625, + "completions/mean_terminated_length": 520.0625, + "completions/min_length": 323.0, + "completions/min_terminated_length": 323.0, + "epoch": 0.042900302114803626, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.034331031143665314, + "learning_rate": 2.530120481927711e-06, + "loss": -0.0044, + "num_tokens": 12682453.0, + "reward": 3.0446548461914062, + "reward_std": 1.8096741437911987, + "rewards/accuracy_reward/mean": 2.2985613346099854, + "rewards/accuracy_reward/std": 3.4728713035583496, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 71 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 726.0, + "completions/max_terminated_length": 726.0, + "completions/mean_length": 505.59375, + "completions/mean_terminated_length": 505.59375, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "epoch": 0.04350453172205438, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03359673172235489, + "learning_rate": 2.5662650602409637e-06, + "loss": 0.0085, + "num_tokens": 12819675.0, + "reward": 5.302990436553955, + "reward_std": 1.5996900796890259, + "rewards/accuracy_reward/mean": 4.552990436553955, + "rewards/accuracy_reward/std": 3.5575504302978516, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 72 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 797.0, + "completions/mean_length": 566.625, + "completions/mean_terminated_length": 518.8386840820312, + "completions/min_length": 405.0, + "completions/min_terminated_length": 405.0, + "epoch": 0.044108761329305135, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04470408707857132, + "learning_rate": 2.602409638554217e-06, + "loss": -0.0463, + "num_tokens": 12968083.0, + "reward": 5.939455986022949, + "reward_std": 2.485675096511841, + "rewards/accuracy_reward/mean": 5.212893486022949, + "rewards/accuracy_reward/std": 3.4402058124542236, + "rewards/tag_count_reward/mean": 0.7265625, + "rewards/tag_count_reward/std": 0.13152606785297394, + "step": 73 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1737.0, + "completions/max_terminated_length": 1737.0, + "completions/mean_length": 735.453125, + "completions/mean_terminated_length": 735.453125, + "completions/min_length": 357.0, + "completions/min_terminated_length": 357.0, + "epoch": 0.04471299093655589, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.029334330931305885, + "learning_rate": 2.6385542168674697e-06, + "loss": 0.0101, + "num_tokens": 13135472.0, + "reward": 4.990227699279785, + "reward_std": 1.2241981029510498, + "rewards/accuracy_reward/mean": 4.240227699279785, + "rewards/accuracy_reward/std": 3.679255247116089, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 74 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 914.0, + "completions/max_terminated_length": 914.0, + "completions/mean_length": 578.03125, + "completions/mean_terminated_length": 578.03125, + "completions/min_length": 351.0, + "completions/min_terminated_length": 351.0, + "epoch": 0.045317220543806644, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.017101343721151352, + "learning_rate": 2.674698795180723e-06, + "loss": 0.0088, + "num_tokens": 13348914.0, + "reward": 2.478384256362915, + "reward_std": 0.5276368260383606, + "rewards/accuracy_reward/mean": 1.7283843755722046, + "rewards/accuracy_reward/std": 3.188217878341675, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 75 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 871.0, + "completions/max_terminated_length": 871.0, + "completions/mean_length": 549.4375, + "completions/mean_terminated_length": 549.4375, + "completions/min_length": 322.0, + "completions/min_terminated_length": 322.0, + "epoch": 0.045921450151057405, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0338568314909935, + "learning_rate": 2.710843373493976e-06, + "loss": 0.0072, + "num_tokens": 13598878.0, + "reward": 5.410005569458008, + "reward_std": 1.5816714763641357, + "rewards/accuracy_reward/mean": 4.660005569458008, + "rewards/accuracy_reward/std": 3.638256549835205, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 76 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 833.0, + "completions/max_terminated_length": 833.0, + "completions/mean_length": 566.4375, + "completions/mean_terminated_length": 566.4375, + "completions/min_length": 364.0, + "completions/min_terminated_length": 364.0, + "epoch": 0.04652567975830816, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03259054198861122, + "learning_rate": 2.746987951807229e-06, + "loss": 0.0202, + "num_tokens": 13808426.0, + "reward": 5.613226413726807, + "reward_std": 0.968408465385437, + "rewards/accuracy_reward/mean": 4.863226413726807, + "rewards/accuracy_reward/std": 3.568685531616211, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 77 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1031.0, + "completions/max_terminated_length": 1031.0, + "completions/mean_length": 693.84375, + "completions/mean_terminated_length": 693.84375, + "completions/min_length": 451.0, + "completions/min_terminated_length": 451.0, + "epoch": 0.047129909365558914, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.044690512120723724, + "learning_rate": 2.783132530120482e-06, + "loss": 0.0009, + "num_tokens": 14011104.0, + "reward": 4.220085144042969, + "reward_std": 2.404521942138672, + "rewards/accuracy_reward/mean": 3.470085620880127, + "rewards/accuracy_reward/std": 3.7446470260620117, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 78 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1096.0, + "completions/max_terminated_length": 1096.0, + "completions/mean_length": 545.9375, + "completions/mean_terminated_length": 545.9375, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "epoch": 0.04773413897280967, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.040847297757864, + "learning_rate": 2.819277108433735e-06, + "loss": -0.0111, + "num_tokens": 14163548.0, + "reward": 5.609569549560547, + "reward_std": 1.8820838928222656, + "rewards/accuracy_reward/mean": 4.871288299560547, + "rewards/accuracy_reward/std": 3.56123423576355, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 79 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 856.0, + "completions/max_terminated_length": 856.0, + "completions/mean_length": 569.5, + "completions/mean_terminated_length": 569.5, + "completions/min_length": 329.0, + "completions/min_terminated_length": 329.0, + "epoch": 0.04833836858006042, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.02404641918838024, + "learning_rate": 2.855421686746988e-06, + "loss": 0.0027, + "num_tokens": 14376396.0, + "reward": 2.96919584274292, + "reward_std": 0.7573180794715881, + "rewards/accuracy_reward/mean": 2.21919584274292, + "rewards/accuracy_reward/std": 3.4422881603240967, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 80 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 887.0, + "completions/max_terminated_length": 887.0, + "completions/mean_length": 582.203125, + "completions/mean_terminated_length": 582.203125, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "epoch": 0.04894259818731118, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03504716977477074, + "learning_rate": 2.891566265060241e-06, + "loss": -0.0148, + "num_tokens": 14525609.0, + "reward": 4.408175468444824, + "reward_std": 1.673645257949829, + "rewards/accuracy_reward/mean": 3.658175468444824, + "rewards/accuracy_reward/std": 3.92238187789917, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 81 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 830.0, + "completions/max_terminated_length": 830.0, + "completions/mean_length": 573.515625, + "completions/mean_terminated_length": 573.515625, + "completions/min_length": 343.0, + "completions/min_terminated_length": 343.0, + "epoch": 0.04954682779456193, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03911672160029411, + "learning_rate": 2.927710843373494e-06, + "loss": 0.0271, + "num_tokens": 14677642.0, + "reward": 3.7531707286834717, + "reward_std": 2.4153661727905273, + "rewards/accuracy_reward/mean": 3.0031707286834717, + "rewards/accuracy_reward/std": 3.829704761505127, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 82 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 860.0, + "completions/max_terminated_length": 860.0, + "completions/mean_length": 577.765625, + "completions/mean_terminated_length": 577.765625, + "completions/min_length": 296.0, + "completions/min_terminated_length": 296.0, + "epoch": 0.050151057401812686, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04043665900826454, + "learning_rate": 2.963855421686747e-06, + "loss": 0.0202, + "num_tokens": 14821259.0, + "reward": 5.529609680175781, + "reward_std": 1.718989372253418, + "rewards/accuracy_reward/mean": 4.779609680175781, + "rewards/accuracy_reward/std": 3.4525961875915527, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 83 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 922.0, + "completions/max_terminated_length": 922.0, + "completions/mean_length": 596.78125, + "completions/mean_terminated_length": 596.78125, + "completions/min_length": 293.0, + "completions/min_terminated_length": 293.0, + "epoch": 0.05075528700906345, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0216986034065485, + "learning_rate": 3e-06, + "loss": 0.0002, + "num_tokens": 14989549.0, + "reward": 1.0827317237854004, + "reward_std": 0.9381287693977356, + "rewards/accuracy_reward/mean": 0.3327317237854004, + "rewards/accuracy_reward/std": 1.3000915050506592, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 84 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 925.0, + "completions/max_terminated_length": 925.0, + "completions/mean_length": 581.15625, + "completions/mean_terminated_length": 581.15625, + "completions/min_length": 309.0, + "completions/min_terminated_length": 309.0, + "epoch": 0.0513595166163142, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03526873514056206, + "learning_rate": 2.9999973041340697e-06, + "loss": 0.0188, + "num_tokens": 15147559.0, + "reward": 5.6801652908325195, + "reward_std": 1.8443583250045776, + "rewards/accuracy_reward/mean": 4.9301652908325195, + "rewards/accuracy_reward/std": 3.5995872020721436, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 85 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 902.0, + "completions/max_terminated_length": 902.0, + "completions/mean_length": 627.75, + "completions/mean_terminated_length": 627.75, + "completions/min_length": 425.0, + "completions/min_terminated_length": 425.0, + "epoch": 0.051963746223564956, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.029386617243289948, + "learning_rate": 2.999989216547045e-06, + "loss": 0.0197, + "num_tokens": 15299047.0, + "reward": 2.8432297706604004, + "reward_std": 1.2093195915222168, + "rewards/accuracy_reward/mean": 2.0932297706604004, + "rewards/accuracy_reward/std": 3.3727924823760986, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 86 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1225.0, + "completions/max_terminated_length": 1225.0, + "completions/mean_length": 651.515625, + "completions/mean_terminated_length": 651.515625, + "completions/min_length": 388.0, + "completions/min_terminated_length": 388.0, + "epoch": 0.05256797583081571, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.051246277987957, + "learning_rate": 2.9999757372712276e-06, + "loss": -0.0038, + "num_tokens": 15501016.0, + "reward": 3.644287109375, + "reward_std": 2.4695963859558105, + "rewards/accuracy_reward/mean": 2.894287109375, + "rewards/accuracy_reward/std": 3.6780827045440674, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 87 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1154.0, + "completions/mean_length": 636.609375, + "completions/mean_terminated_length": 614.2063598632812, + "completions/min_length": 385.0, + "completions/min_terminated_length": 385.0, + "epoch": 0.053172205438066465, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05922790616750717, + "learning_rate": 2.9999568663604516e-06, + "loss": -0.0334, + "num_tokens": 15749919.0, + "reward": 6.226980209350586, + "reward_std": 2.729153633117676, + "rewards/accuracy_reward/mean": 5.488698482513428, + "rewards/accuracy_reward/std": 3.346874475479126, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 88 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 914.0, + "completions/max_terminated_length": 914.0, + "completions/mean_length": 590.8125, + "completions/mean_terminated_length": 590.8125, + "completions/min_length": 347.0, + "completions/min_terminated_length": 347.0, + "epoch": 0.05377643504531722, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05123714730143547, + "learning_rate": 2.9999326038900847e-06, + "loss": -0.0032, + "num_tokens": 15908803.0, + "reward": 6.631827354431152, + "reward_std": 2.559030532836914, + "rewards/accuracy_reward/mean": 5.893545627593994, + "rewards/accuracy_reward/std": 2.9811899662017822, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 89 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1554.0, + "completions/max_terminated_length": 1554.0, + "completions/mean_length": 614.0, + "completions/mean_terminated_length": 614.0, + "completions/min_length": 335.0, + "completions/min_terminated_length": 335.0, + "epoch": 0.054380664652567974, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04847027361392975, + "learning_rate": 2.999902949957029e-06, + "loss": 0.021, + "num_tokens": 16088979.0, + "reward": 5.033849239349365, + "reward_std": 2.9874229431152344, + "rewards/accuracy_reward/mean": 4.283849239349365, + "rewards/accuracy_reward/std": 3.722419023513794, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 90 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 852.0, + "completions/max_terminated_length": 852.0, + "completions/mean_length": 546.4375, + "completions/mean_terminated_length": 546.4375, + "completions/min_length": 326.0, + "completions/min_terminated_length": 326.0, + "epoch": 0.05498489425981873, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03147951140999794, + "learning_rate": 2.999867904679718e-06, + "loss": -0.0057, + "num_tokens": 16323583.0, + "reward": 3.696742534637451, + "reward_std": 1.1135358810424805, + "rewards/accuracy_reward/mean": 2.946742296218872, + "rewards/accuracy_reward/std": 3.661346435546875, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 91 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 886.0, + "completions/max_terminated_length": 886.0, + "completions/mean_length": 625.34375, + "completions/mean_terminated_length": 625.34375, + "completions/min_length": 376.0, + "completions/min_terminated_length": 376.0, + "epoch": 0.05558912386706949, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.043377071619033813, + "learning_rate": 2.9998274681981186e-06, + "loss": -0.0017, + "num_tokens": 16467669.0, + "reward": 6.6405029296875, + "reward_std": 1.8959565162658691, + "rewards/accuracy_reward/mean": 5.8905029296875, + "rewards/accuracy_reward/std": 3.0401198863983154, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 92 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 736.0, + "completions/max_terminated_length": 736.0, + "completions/mean_length": 532.515625, + "completions/mean_terminated_length": 532.515625, + "completions/min_length": 383.0, + "completions/min_terminated_length": 383.0, + "epoch": 0.056193353474320244, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03810126706957817, + "learning_rate": 2.9997816406737287e-06, + "loss": 0.0071, + "num_tokens": 16639478.0, + "reward": 3.734386920928955, + "reward_std": 2.148244619369507, + "rewards/accuracy_reward/mean": 2.984386920928955, + "rewards/accuracy_reward/std": 3.5909066200256348, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 93 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 880.0, + "completions/max_terminated_length": 880.0, + "completions/mean_length": 521.6875, + "completions/mean_terminated_length": 521.6875, + "completions/min_length": 350.0, + "completions/min_terminated_length": 350.0, + "epoch": 0.056797583081571, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.02940271981060505, + "learning_rate": 2.9997304222895776e-06, + "loss": 0.003, + "num_tokens": 16782018.0, + "reward": 4.650416374206543, + "reward_std": 1.2309472560882568, + "rewards/accuracy_reward/mean": 3.9004158973693848, + "rewards/accuracy_reward/std": 3.6932566165924072, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 94 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1043.0, + "completions/mean_length": 708.9375, + "completions/mean_terminated_length": 687.6825561523438, + "completions/min_length": 417.0, + "completions/min_terminated_length": 417.0, + "epoch": 0.05740181268882175, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03643621504306793, + "learning_rate": 2.999673813250225e-06, + "loss": -0.0117, + "num_tokens": 16912750.0, + "reward": 3.686872959136963, + "reward_std": 1.5636723041534424, + "rewards/accuracy_reward/mean": 2.948591709136963, + "rewards/accuracy_reward/std": 3.7624149322509766, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 95 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 787.0, + "completions/max_terminated_length": 787.0, + "completions/mean_length": 567.75, + "completions/mean_terminated_length": 567.75, + "completions/min_length": 356.0, + "completions/min_terminated_length": 356.0, + "epoch": 0.05800604229607251, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.039692092686891556, + "learning_rate": 2.9996118137817615e-06, + "loss": -0.0195, + "num_tokens": 17059854.0, + "reward": 2.2745227813720703, + "reward_std": 1.8798869848251343, + "rewards/accuracy_reward/mean": 1.5245225429534912, + "rewards/accuracy_reward/std": 3.020784378051758, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 96 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1195.0, + "completions/max_terminated_length": 1195.0, + "completions/mean_length": 626.75, + "completions/mean_terminated_length": 626.75, + "completions/min_length": 372.0, + "completions/min_terminated_length": 372.0, + "epoch": 0.05861027190332326, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.02376224659383297, + "learning_rate": 2.9995444241318047e-06, + "loss": -0.014, + "num_tokens": 17243150.0, + "reward": 3.1198887825012207, + "reward_std": 1.1079230308532715, + "rewards/accuracy_reward/mean": 2.3698887825012207, + "rewards/accuracy_reward/std": 3.3132996559143066, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 97 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 862.0, + "completions/max_terminated_length": 862.0, + "completions/mean_length": 565.0, + "completions/mean_terminated_length": 565.0, + "completions/min_length": 359.0, + "completions/min_terminated_length": 359.0, + "epoch": 0.059214501510574016, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03275354579091072, + "learning_rate": 2.9994716445695e-06, + "loss": 0.0029, + "num_tokens": 17448270.0, + "reward": 3.84015154838562, + "reward_std": 1.5348488092422485, + "rewards/accuracy_reward/mean": 3.090151786804199, + "rewards/accuracy_reward/std": 3.5904805660247803, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 98 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 857.0, + "completions/max_terminated_length": 857.0, + "completions/mean_length": 583.828125, + "completions/mean_terminated_length": 583.828125, + "completions/min_length": 398.0, + "completions/min_terminated_length": 398.0, + "epoch": 0.05981873111782477, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0383462980389595, + "learning_rate": 2.9993934753855196e-06, + "loss": -0.0208, + "num_tokens": 17603187.0, + "reward": 4.915212631225586, + "reward_std": 1.4375020265579224, + "rewards/accuracy_reward/mean": 4.165212631225586, + "rewards/accuracy_reward/std": 3.7364189624786377, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 99 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 809.0, + "completions/max_terminated_length": 809.0, + "completions/mean_length": 536.125, + "completions/mean_terminated_length": 536.125, + "completions/min_length": 323.0, + "completions/min_terminated_length": 323.0, + "epoch": 0.06042296072507553, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03664792701601982, + "learning_rate": 2.999309916892063e-06, + "loss": -0.0116, + "num_tokens": 17753675.0, + "reward": 3.938877820968628, + "reward_std": 1.7614582777023315, + "rewards/accuracy_reward/mean": 3.188878059387207, + "rewards/accuracy_reward/std": 3.448296070098877, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 948.0, + "completions/max_terminated_length": 948.0, + "completions/mean_length": 626.78125, + "completions/mean_terminated_length": 626.78125, + "completions/min_length": 355.0, + "completions/min_terminated_length": 355.0, + "epoch": 0.06102719033232629, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03150085732340813, + "learning_rate": 2.999220969422851e-06, + "loss": 0.0312, + "num_tokens": 17903101.0, + "reward": 2.8824591636657715, + "reward_std": 1.920320749282837, + "rewards/accuracy_reward/mean": 2.1324591636657715, + "rewards/accuracy_reward/std": 3.2983109951019287, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1060.0, + "completions/mean_length": 740.421875, + "completions/mean_terminated_length": 676.11474609375, + "completions/min_length": 412.0, + "completions/min_terminated_length": 412.0, + "epoch": 0.06163141993957704, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.040707990527153015, + "learning_rate": 2.999126633333129e-06, + "loss": -0.0689, + "num_tokens": 18080856.0, + "reward": 3.355694055557251, + "reward_std": 2.2252180576324463, + "rewards/accuracy_reward/mean": 2.640850067138672, + "rewards/accuracy_reward/std": 3.6317341327667236, + "rewards/tag_count_reward/mean": 0.71484375, + "rewards/tag_count_reward/std": 0.1597815304994583, + "step": 102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1374.0, + "completions/max_terminated_length": 1374.0, + "completions/mean_length": 590.546875, + "completions/mean_terminated_length": 590.546875, + "completions/min_length": 415.0, + "completions/min_terminated_length": 415.0, + "epoch": 0.062235649546827795, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.02970181778073311, + "learning_rate": 2.9990269089996642e-06, + "loss": 0.0096, + "num_tokens": 18276971.0, + "reward": 5.710877418518066, + "reward_std": 1.290922999382019, + "rewards/accuracy_reward/mean": 4.960877418518066, + "rewards/accuracy_reward/std": 3.4954428672790527, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 893.0, + "completions/max_terminated_length": 893.0, + "completions/mean_length": 521.25, + "completions/mean_terminated_length": 521.25, + "completions/min_length": 297.0, + "completions/min_terminated_length": 297.0, + "epoch": 0.06283987915407856, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.039336930960416794, + "learning_rate": 2.9989217968207424e-06, + "loss": -0.0009, + "num_tokens": 18403259.0, + "reward": 3.080920457839966, + "reward_std": 1.813497543334961, + "rewards/accuracy_reward/mean": 2.330920696258545, + "rewards/accuracy_reward/std": 3.470106363296509, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 949.0, + "completions/max_terminated_length": 949.0, + "completions/mean_length": 656.359375, + "completions/mean_terminated_length": 656.359375, + "completions/min_length": 341.0, + "completions/min_terminated_length": 341.0, + "epoch": 0.0634441087613293, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03515372797846794, + "learning_rate": 2.998811297216169e-06, + "loss": 0.0052, + "num_tokens": 18542098.0, + "reward": 4.23433780670166, + "reward_std": 1.720855474472046, + "rewards/accuracy_reward/mean": 3.484337568283081, + "rewards/accuracy_reward/std": 3.6412739753723145, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 932.0, + "completions/max_terminated_length": 932.0, + "completions/mean_length": 577.53125, + "completions/mean_terminated_length": 577.53125, + "completions/min_length": 349.0, + "completions/min_terminated_length": 349.0, + "epoch": 0.06404833836858007, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.019109832122921944, + "learning_rate": 2.998695410627266e-06, + "loss": -0.0105, + "num_tokens": 18696388.0, + "reward": 2.657257080078125, + "reward_std": 1.0372073650360107, + "rewards/accuracy_reward/mean": 1.9072569608688354, + "rewards/accuracy_reward/std": 3.234621286392212, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 720.0, + "completions/max_terminated_length": 720.0, + "completions/mean_length": 575.671875, + "completions/mean_terminated_length": 575.671875, + "completions/min_length": 412.0, + "completions/min_terminated_length": 412.0, + "epoch": 0.06465256797583081, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03605879098176956, + "learning_rate": 2.9985741375168693e-06, + "loss": 0.005, + "num_tokens": 18853199.0, + "reward": 4.214863300323486, + "reward_std": 1.9177252054214478, + "rewards/accuracy_reward/mean": 3.4648633003234863, + "rewards/accuracy_reward/std": 3.751542329788208, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1189.0, + "completions/max_terminated_length": 1189.0, + "completions/mean_length": 701.984375, + "completions/mean_terminated_length": 701.984375, + "completions/min_length": 375.0, + "completions/min_terminated_length": 375.0, + "epoch": 0.06525679758308157, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03872508555650711, + "learning_rate": 2.998447478369329e-06, + "loss": 0.008, + "num_tokens": 19016382.0, + "reward": 2.7385356426239014, + "reward_std": 1.9125361442565918, + "rewards/accuracy_reward/mean": 2.0041604042053223, + "rewards/accuracy_reward/std": 3.3130476474761963, + "rewards/tag_count_reward/mean": 0.734375, + "rewards/tag_count_reward/std": 0.09834947437047958, + "step": 108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1023.0, + "completions/max_terminated_length": 1023.0, + "completions/mean_length": 601.1875, + "completions/mean_terminated_length": 601.1875, + "completions/min_length": 369.0, + "completions/min_terminated_length": 369.0, + "epoch": 0.06586102719033232, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04055851697921753, + "learning_rate": 2.998315433690505e-06, + "loss": 0.0016, + "num_tokens": 19207098.0, + "reward": 4.762354373931885, + "reward_std": 2.730745792388916, + "rewards/accuracy_reward/mean": 4.012354373931885, + "rewards/accuracy_reward/std": 3.781899929046631, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1059.0, + "completions/max_terminated_length": 1059.0, + "completions/mean_length": 598.921875, + "completions/mean_terminated_length": 598.921875, + "completions/min_length": 389.0, + "completions/min_terminated_length": 389.0, + "epoch": 0.06646525679758308, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.054351482540369034, + "learning_rate": 2.998178004007769e-06, + "loss": 0.0102, + "num_tokens": 19369093.0, + "reward": 3.030040979385376, + "reward_std": 2.691342353820801, + "rewards/accuracy_reward/mean": 2.283946990966797, + "rewards/accuracy_reward/std": 3.4869415760040283, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1017.0, + "completions/max_terminated_length": 1017.0, + "completions/mean_length": 585.265625, + "completions/mean_terminated_length": 585.265625, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "epoch": 0.06706948640483383, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.036852214485406876, + "learning_rate": 2.998035189869997e-06, + "loss": -0.008, + "num_tokens": 19522662.0, + "reward": 4.422540664672852, + "reward_std": 2.317607879638672, + "rewards/accuracy_reward/mean": 3.6842591762542725, + "rewards/accuracy_reward/std": 3.780526638031006, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1389.0, + "completions/max_terminated_length": 1389.0, + "completions/mean_length": 616.640625, + "completions/mean_terminated_length": 616.640625, + "completions/min_length": 317.0, + "completions/min_terminated_length": 317.0, + "epoch": 0.06767371601208459, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.050860170274972916, + "learning_rate": 2.997886991847571e-06, + "loss": 0.0027, + "num_tokens": 19677487.0, + "reward": 5.876833915710449, + "reward_std": 2.3546602725982666, + "rewards/accuracy_reward/mean": 5.138552665710449, + "rewards/accuracy_reward/std": 3.3204729557037354, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 825.0, + "completions/max_terminated_length": 825.0, + "completions/mean_length": 585.46875, + "completions/mean_terminated_length": 585.46875, + "completions/min_length": 356.0, + "completions/min_terminated_length": 356.0, + "epoch": 0.06827794561933535, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.047949906438589096, + "learning_rate": 2.9977334105323754e-06, + "loss": -0.0159, + "num_tokens": 19835469.0, + "reward": 5.829398155212402, + "reward_std": 2.776824474334717, + "rewards/accuracy_reward/mean": 5.0793986320495605, + "rewards/accuracy_reward/std": 3.488227367401123, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 909.0, + "completions/max_terminated_length": 909.0, + "completions/mean_length": 547.046875, + "completions/mean_terminated_length": 547.046875, + "completions/min_length": 374.0, + "completions/min_terminated_length": 374.0, + "epoch": 0.0688821752265861, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03340290114283562, + "learning_rate": 2.997574446537795e-06, + "loss": -0.0048, + "num_tokens": 20008320.0, + "reward": 4.16779899597168, + "reward_std": 1.982933759689331, + "rewards/accuracy_reward/mean": 3.4177989959716797, + "rewards/accuracy_reward/std": 3.677093744277954, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 971.0, + "completions/max_terminated_length": 971.0, + "completions/mean_length": 618.234375, + "completions/mean_terminated_length": 618.234375, + "completions/min_length": 430.0, + "completions/min_terminated_length": 430.0, + "epoch": 0.06948640483383686, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03750219568610191, + "learning_rate": 2.997410100498712e-06, + "loss": 0.0224, + "num_tokens": 20172623.0, + "reward": 1.7890222072601318, + "reward_std": 1.6246843338012695, + "rewards/accuracy_reward/mean": 1.0390222072601318, + "rewards/accuracy_reward/std": 2.5947983264923096, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 835.0, + "completions/max_terminated_length": 835.0, + "completions/mean_length": 541.671875, + "completions/mean_terminated_length": 541.671875, + "completions/min_length": 299.0, + "completions/min_terminated_length": 299.0, + "epoch": 0.07009063444108761, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.001629471778869629, + "learning_rate": 2.9972403730715045e-06, + "loss": -0.0017, + "num_tokens": 20322458.0, + "reward": 4.344324111938477, + "reward_std": 0.07085655629634857, + "rewards/accuracy_reward/mean": 3.5943238735198975, + "rewards/accuracy_reward/std": 3.6251604557037354, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 768.0, + "completions/max_terminated_length": 768.0, + "completions/mean_length": 505.625, + "completions/mean_terminated_length": 505.625, + "completions/min_length": 364.0, + "completions/min_terminated_length": 364.0, + "epoch": 0.07069486404833837, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04146237298846245, + "learning_rate": 2.9970652649340417e-06, + "loss": 0.01, + "num_tokens": 20489570.0, + "reward": 3.817746162414551, + "reward_std": 2.8056769371032715, + "rewards/accuracy_reward/mean": 3.071652412414551, + "rewards/accuracy_reward/std": 3.6587417125701904, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 865.0, + "completions/max_terminated_length": 865.0, + "completions/mean_length": 597.65625, + "completions/mean_terminated_length": 597.65625, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "epoch": 0.07129909365558912, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.03837282210588455, + "learning_rate": 2.9968847767856848e-06, + "loss": 0.0101, + "num_tokens": 20630172.0, + "reward": 3.502493143081665, + "reward_std": 1.4465996026992798, + "rewards/accuracy_reward/mean": 2.764211893081665, + "rewards/accuracy_reward/std": 3.631552219390869, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1379.0, + "completions/max_terminated_length": 1379.0, + "completions/mean_length": 620.0, + "completions/mean_terminated_length": 620.0, + "completions/min_length": 295.0, + "completions/min_terminated_length": 295.0, + "epoch": 0.07190332326283988, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03699041157960892, + "learning_rate": 2.9966989093472808e-06, + "loss": -0.0004, + "num_tokens": 20801500.0, + "reward": 3.899690628051758, + "reward_std": 1.4449291229248047, + "rewards/accuracy_reward/mean": 3.149690628051758, + "rewards/accuracy_reward/std": 3.6203842163085938, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 798.0, + "completions/max_terminated_length": 798.0, + "completions/mean_length": 539.84375, + "completions/mean_terminated_length": 539.84375, + "completions/min_length": 365.0, + "completions/min_terminated_length": 365.0, + "epoch": 0.07250755287009064, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.02928418479859829, + "learning_rate": 2.9965076633611604e-06, + "loss": 0.0083, + "num_tokens": 20946818.0, + "reward": 4.805765628814697, + "reward_std": 1.3050713539123535, + "rewards/accuracy_reward/mean": 4.055765628814697, + "rewards/accuracy_reward/std": 3.721214532852173, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 915.0, + "completions/max_terminated_length": 915.0, + "completions/mean_length": 512.59375, + "completions/mean_terminated_length": 512.59375, + "completions/min_length": 312.0, + "completions/min_terminated_length": 312.0, + "epoch": 0.07311178247734139, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.030262263491749763, + "learning_rate": 2.9963110395911366e-06, + "loss": -0.0008, + "num_tokens": 21077608.0, + "reward": 2.9120912551879883, + "reward_std": 1.2134807109832764, + "rewards/accuracy_reward/mean": 2.1620912551879883, + "rewards/accuracy_reward/std": 3.2524282932281494, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1735.0, + "completions/mean_length": 805.59375, + "completions/mean_terminated_length": 785.873046875, + "completions/min_length": 485.0, + "completions/min_terminated_length": 485.0, + "epoch": 0.07371601208459215, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.029074551537632942, + "learning_rate": 2.9961090388225007e-06, + "loss": -0.0209, + "num_tokens": 21246206.0, + "reward": 1.9821910858154297, + "reward_std": 1.1410382986068726, + "rewards/accuracy_reward/mean": 1.2360974550247192, + "rewards/accuracy_reward/std": 2.7300002574920654, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1388.0, + "completions/mean_length": 736.984375, + "completions/mean_terminated_length": 575.9824829101562, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "epoch": 0.0743202416918429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.019335592165589333, + "learning_rate": 2.9959016618620178e-06, + "loss": -0.0573, + "num_tokens": 21391517.0, + "reward": 5.927640438079834, + "reward_std": 0.7850548028945923, + "rewards/accuracy_reward/mean": 5.259671211242676, + "rewards/accuracy_reward/std": 3.4853179454803467, + "rewards/tag_count_reward/mean": 0.66796875, + "rewards/tag_count_reward/std": 0.2359323352575302, + "step": 123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1423.0, + "completions/mean_length": 747.5625, + "completions/mean_terminated_length": 726.920654296875, + "completions/min_length": 443.0, + "completions/min_terminated_length": 443.0, + "epoch": 0.07492447129909366, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03788154944777489, + "learning_rate": 2.9956889095379263e-06, + "loss": 0.0628, + "num_tokens": 21559377.0, + "reward": 3.923231840133667, + "reward_std": 1.0131011009216309, + "rewards/accuracy_reward/mean": 3.184950590133667, + "rewards/accuracy_reward/std": 3.7745285034179688, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 772.0, + "completions/max_terminated_length": 772.0, + "completions/mean_length": 517.40625, + "completions/mean_terminated_length": 517.40625, + "completions/min_length": 285.0, + "completions/min_terminated_length": 285.0, + "epoch": 0.0755287009063444, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.043003156781196594, + "learning_rate": 2.995470782699932e-06, + "loss": -0.0086, + "num_tokens": 21762011.0, + "reward": 3.974968671798706, + "reward_std": 2.217547655105591, + "rewards/accuracy_reward/mean": 3.224968671798706, + "rewards/accuracy_reward/std": 3.675572395324707, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1112.0, + "completions/max_terminated_length": 1112.0, + "completions/mean_length": 657.53125, + "completions/mean_terminated_length": 657.53125, + "completions/min_length": 402.0, + "completions/min_terminated_length": 402.0, + "epoch": 0.07613293051359517, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.046992648392915726, + "learning_rate": 2.9952472822192074e-06, + "loss": -0.0597, + "num_tokens": 21975709.0, + "reward": 1.8844187259674072, + "reward_std": 2.0937459468841553, + "rewards/accuracy_reward/mean": 1.1402781009674072, + "rewards/accuracy_reward/std": 2.5815834999084473, + "rewards/tag_count_reward/mean": 0.744140625, + "rewards/tag_count_reward/std": 0.03471602126955986, + "step": 126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 687.0, + "completions/mean_length": 533.546875, + "completions/mean_terminated_length": 509.5079650878906, + "completions/min_length": 323.0, + "completions/min_terminated_length": 323.0, + "epoch": 0.07673716012084592, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04719604179263115, + "learning_rate": 2.995018408988384e-06, + "loss": -0.0167, + "num_tokens": 22116464.0, + "reward": 6.534964084625244, + "reward_std": 2.2860116958618164, + "rewards/accuracy_reward/mean": 5.796682834625244, + "rewards/accuracy_reward/std": 3.1930532455444336, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1561.0, + "completions/max_terminated_length": 1561.0, + "completions/mean_length": 624.171875, + "completions/mean_terminated_length": 624.171875, + "completions/min_length": 345.0, + "completions/min_terminated_length": 345.0, + "epoch": 0.07734138972809668, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04340837150812149, + "learning_rate": 2.994784163921554e-06, + "loss": -0.064, + "num_tokens": 22321563.0, + "reward": 2.7456860542297363, + "reward_std": 1.5769238471984863, + "rewards/accuracy_reward/mean": 1.9956859350204468, + "rewards/accuracy_reward/std": 3.3875577449798584, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1622.0, + "completions/max_terminated_length": 1622.0, + "completions/mean_length": 536.03125, + "completions/mean_terminated_length": 536.03125, + "completions/min_length": 369.0, + "completions/min_terminated_length": 369.0, + "epoch": 0.07794561933534744, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03713426738977432, + "learning_rate": 2.994544547954263e-06, + "loss": -0.0046, + "num_tokens": 22461037.0, + "reward": 3.3176939487457275, + "reward_std": 1.4524877071380615, + "rewards/accuracy_reward/mean": 2.5676939487457275, + "rewards/accuracy_reward/std": 3.5337941646575928, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 830.0, + "completions/max_terminated_length": 830.0, + "completions/mean_length": 526.921875, + "completions/mean_terminated_length": 526.921875, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, + "epoch": 0.07854984894259819, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.030154595151543617, + "learning_rate": 2.994299562043507e-06, + "loss": 0.0084, + "num_tokens": 22615800.0, + "reward": 5.9232587814331055, + "reward_std": 0.8902535438537598, + "rewards/accuracy_reward/mean": 5.1732587814331055, + "rewards/accuracy_reward/std": 3.293062686920166, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 808.0, + "completions/max_terminated_length": 808.0, + "completions/mean_length": 537.640625, + "completions/mean_terminated_length": 537.640625, + "completions/min_length": 338.0, + "completions/min_terminated_length": 338.0, + "epoch": 0.07915407854984895, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.037243783473968506, + "learning_rate": 2.994049207167729e-06, + "loss": -0.0008, + "num_tokens": 22783873.0, + "reward": 4.527503967285156, + "reward_std": 1.5719330310821533, + "rewards/accuracy_reward/mean": 3.7775039672851562, + "rewards/accuracy_reward/std": 4.025949001312256, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1138.0, + "completions/max_terminated_length": 1138.0, + "completions/mean_length": 557.96875, + "completions/mean_terminated_length": 557.96875, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "epoch": 0.0797583081570997, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04677026346325874, + "learning_rate": 2.993793484326816e-06, + "loss": 0.0137, + "num_tokens": 22960543.0, + "reward": 4.80448055267334, + "reward_std": 2.4430527687072754, + "rewards/accuracy_reward/mean": 4.06619930267334, + "rewards/accuracy_reward/std": 3.783313751220703, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 912.0, + "completions/max_terminated_length": 912.0, + "completions/mean_length": 604.546875, + "completions/mean_terminated_length": 604.546875, + "completions/min_length": 440.0, + "completions/min_terminated_length": 440.0, + "epoch": 0.08036253776435046, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04621279984712601, + "learning_rate": 2.9935323945420924e-06, + "loss": -0.0068, + "num_tokens": 23144498.0, + "reward": 6.337588310241699, + "reward_std": 2.349282741546631, + "rewards/accuracy_reward/mean": 5.591494560241699, + "rewards/accuracy_reward/std": 3.2046494483947754, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 927.0, + "completions/max_terminated_length": 927.0, + "completions/mean_length": 623.875, + "completions/mean_terminated_length": 623.875, + "completions/min_length": 371.0, + "completions/min_terminated_length": 371.0, + "epoch": 0.0809667673716012, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03257531672716141, + "learning_rate": 2.9932659388563182e-06, + "loss": 0.0006, + "num_tokens": 23348714.0, + "reward": 3.7420053482055664, + "reward_std": 1.4128153324127197, + "rewards/accuracy_reward/mean": 2.9920053482055664, + "rewards/accuracy_reward/std": 3.6801364421844482, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 808.0, + "completions/max_terminated_length": 808.0, + "completions/mean_length": 565.265625, + "completions/mean_terminated_length": 565.265625, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "epoch": 0.08157099697885196, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03282265365123749, + "learning_rate": 2.9929941183336853e-06, + "loss": -0.0055, + "num_tokens": 23512699.0, + "reward": 5.378772258758545, + "reward_std": 0.9627033472061157, + "rewards/accuracy_reward/mean": 4.628771781921387, + "rewards/accuracy_reward/std": 3.5052218437194824, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1509.0, + "completions/mean_length": 669.78125, + "completions/mean_terminated_length": 647.90478515625, + "completions/min_length": 332.0, + "completions/min_terminated_length": 332.0, + "epoch": 0.08217522658610273, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03152112290263176, + "learning_rate": 2.99271693405981e-06, + "loss": -0.034, + "num_tokens": 23686093.0, + "reward": 5.7472405433654785, + "reward_std": 1.0193448066711426, + "rewards/accuracy_reward/mean": 5.0089592933654785, + "rewards/accuracy_reward/std": 3.3930928707122803, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 838.0, + "completions/max_terminated_length": 838.0, + "completions/mean_length": 518.125, + "completions/mean_terminated_length": 518.125, + "completions/min_length": 313.0, + "completions/min_terminated_length": 313.0, + "epoch": 0.08277945619335347, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04454374685883522, + "learning_rate": 2.992434387141732e-06, + "loss": 0.0108, + "num_tokens": 23835605.0, + "reward": 5.147139549255371, + "reward_std": 2.1848039627075195, + "rewards/accuracy_reward/mean": 4.397139549255371, + "rewards/accuracy_reward/std": 3.5768933296203613, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1214.0, + "completions/max_terminated_length": 1214.0, + "completions/mean_length": 621.234375, + "completions/mean_terminated_length": 621.234375, + "completions/min_length": 342.0, + "completions/min_terminated_length": 342.0, + "epoch": 0.08338368580060423, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03592858090996742, + "learning_rate": 2.992146478707908e-06, + "loss": -0.0216, + "num_tokens": 23976484.0, + "reward": 4.95212459564209, + "reward_std": 1.8753583431243896, + "rewards/accuracy_reward/mean": 4.202123641967773, + "rewards/accuracy_reward/std": 3.675179958343506, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1012.0, + "completions/max_terminated_length": 1012.0, + "completions/mean_length": 617.078125, + "completions/mean_terminated_length": 617.078125, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, + "epoch": 0.08398791540785498, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.042267728596925735, + "learning_rate": 2.9918532099082104e-06, + "loss": 0.006, + "num_tokens": 24147657.0, + "reward": 5.237790107727051, + "reward_std": 2.154233932495117, + "rewards/accuracy_reward/mean": 4.491696357727051, + "rewards/accuracy_reward/std": 3.6260814666748047, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1007.0, + "completions/max_terminated_length": 1007.0, + "completions/mean_length": 547.546875, + "completions/mean_terminated_length": 547.546875, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "epoch": 0.08459214501510574, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04234826937317848, + "learning_rate": 2.991554581913916e-06, + "loss": -0.0021, + "num_tokens": 24286908.0, + "reward": 6.699317932128906, + "reward_std": 1.8446557521820068, + "rewards/accuracy_reward/mean": 5.949317932128906, + "rewards/accuracy_reward/std": 2.92826771736145, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 903.0, + "completions/max_terminated_length": 903.0, + "completions/mean_length": 601.1875, + "completions/mean_terminated_length": 601.1875, + "completions/min_length": 358.0, + "completions/min_terminated_length": 358.0, + "epoch": 0.08519637462235649, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03111099824309349, + "learning_rate": 2.991250595917709e-06, + "loss": 0.0162, + "num_tokens": 24426408.0, + "reward": 6.096480369567871, + "reward_std": 1.242748498916626, + "rewards/accuracy_reward/mean": 5.346480369567871, + "rewards/accuracy_reward/std": 3.3712005615234375, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 860.0, + "completions/max_terminated_length": 860.0, + "completions/mean_length": 613.625, + "completions/mean_terminated_length": 613.625, + "completions/min_length": 432.0, + "completions/min_terminated_length": 432.0, + "epoch": 0.08580060422960725, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04745940491557121, + "learning_rate": 2.9909412531336708e-06, + "loss": 0.0306, + "num_tokens": 24649424.0, + "reward": 5.765105247497559, + "reward_std": 2.1170263290405273, + "rewards/accuracy_reward/mean": 5.015105247497559, + "rewards/accuracy_reward/std": 3.655832052230835, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 876.0, + "completions/max_terminated_length": 876.0, + "completions/mean_length": 574.40625, + "completions/mean_terminated_length": 574.40625, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "epoch": 0.086404833836858, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03351614996790886, + "learning_rate": 2.990626554797279e-06, + "loss": -0.0028, + "num_tokens": 24826314.0, + "reward": 3.2327804565429688, + "reward_std": 1.3242884874343872, + "rewards/accuracy_reward/mean": 2.4827804565429688, + "rewards/accuracy_reward/std": 3.3218941688537598, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 897.0, + "completions/max_terminated_length": 897.0, + "completions/mean_length": 656.6875, + "completions/mean_terminated_length": 656.6875, + "completions/min_length": 469.0, + "completions/min_terminated_length": 469.0, + "epoch": 0.08700906344410876, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03496231511235237, + "learning_rate": 2.990306502165398e-06, + "loss": -0.0177, + "num_tokens": 24986934.0, + "reward": 4.825960159301758, + "reward_std": 1.8262863159179688, + "rewards/accuracy_reward/mean": 4.075960159301758, + "rewards/accuracy_reward/std": 3.7395763397216797, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1049.0, + "completions/max_terminated_length": 1049.0, + "completions/mean_length": 606.125, + "completions/mean_terminated_length": 606.125, + "completions/min_length": 403.0, + "completions/min_terminated_length": 403.0, + "epoch": 0.08761329305135952, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0317770279943943, + "learning_rate": 2.9899810965162803e-06, + "loss": -0.0068, + "num_tokens": 25136142.0, + "reward": 3.730211019515991, + "reward_std": 1.3155345916748047, + "rewards/accuracy_reward/mean": 2.980210781097412, + "rewards/accuracy_reward/std": 3.9953458309173584, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 856.0, + "completions/max_terminated_length": 856.0, + "completions/mean_length": 587.859375, + "completions/mean_terminated_length": 587.859375, + "completions/min_length": 333.0, + "completions/min_terminated_length": 333.0, + "epoch": 0.08821752265861027, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03665482625365257, + "learning_rate": 2.989650339149554e-06, + "loss": -0.0108, + "num_tokens": 25376229.0, + "reward": 3.3527684211730957, + "reward_std": 1.4320611953735352, + "rewards/accuracy_reward/mean": 2.6027681827545166, + "rewards/accuracy_reward/std": 3.504348039627075, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 682.0, + "completions/mean_length": 529.921875, + "completions/mean_terminated_length": 505.825439453125, + "completions/min_length": 349.0, + "completions/min_terminated_length": 349.0, + "epoch": 0.08882175226586103, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.035745058208703995, + "learning_rate": 2.989314231386223e-06, + "loss": -0.0291, + "num_tokens": 25505376.0, + "reward": 5.464145183563232, + "reward_std": 2.0266621112823486, + "rewards/accuracy_reward/mean": 4.725864410400391, + "rewards/accuracy_reward/std": 3.637624740600586, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 801.0, + "completions/max_terminated_length": 801.0, + "completions/mean_length": 473.65625, + "completions/mean_terminated_length": 473.65625, + "completions/min_length": 278.0, + "completions/min_terminated_length": 278.0, + "epoch": 0.08942598187311178, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0028087724931538105, + "learning_rate": 2.9889727745686605e-06, + "loss": 0.0014, + "num_tokens": 25657818.0, + "reward": 2.580484390258789, + "reward_std": 0.09604136645793915, + "rewards/accuracy_reward/mean": 1.8343905210494995, + "rewards/accuracy_reward/std": 3.2413315773010254, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 785.0, + "completions/max_terminated_length": 785.0, + "completions/mean_length": 577.5, + "completions/mean_terminated_length": 577.5, + "completions/min_length": 426.0, + "completions/min_terminated_length": 426.0, + "epoch": 0.09003021148036254, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.02856281027197838, + "learning_rate": 2.988625970060602e-06, + "loss": 0.0098, + "num_tokens": 25836074.0, + "reward": 3.4170620441436768, + "reward_std": 0.9806593060493469, + "rewards/accuracy_reward/mean": 2.667062282562256, + "rewards/accuracy_reward/std": 3.589430570602417, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1492.0, + "completions/max_terminated_length": 1492.0, + "completions/mean_length": 642.390625, + "completions/mean_terminated_length": 642.390625, + "completions/min_length": 327.0, + "completions/min_terminated_length": 327.0, + "epoch": 0.09063444108761329, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04594828188419342, + "learning_rate": 2.988273819247141e-06, + "loss": 0.0465, + "num_tokens": 26085731.0, + "reward": 4.763190269470215, + "reward_std": 2.3872995376586914, + "rewards/accuracy_reward/mean": 4.013190746307373, + "rewards/accuracy_reward/std": 3.7493700981140137, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 903.0, + "completions/mean_length": 655.421875, + "completions/mean_terminated_length": 633.3175048828125, + "completions/min_length": 366.0, + "completions/min_terminated_length": 366.0, + "epoch": 0.09123867069486405, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.012383284978568554, + "learning_rate": 2.987916323534725e-06, + "loss": -0.0529, + "num_tokens": 26250606.0, + "reward": 4.204806804656982, + "reward_std": 1.0384272336959839, + "rewards/accuracy_reward/mean": 3.4665255546569824, + "rewards/accuracy_reward/std": 3.753434419631958, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 844.0, + "completions/max_terminated_length": 844.0, + "completions/mean_length": 470.0, + "completions/mean_terminated_length": 470.0, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "epoch": 0.09184290030211481, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.020334305241703987, + "learning_rate": 2.9875534843511466e-06, + "loss": 0.0068, + "num_tokens": 26366670.0, + "reward": 4.133006572723389, + "reward_std": 0.6694819331169128, + "rewards/accuracy_reward/mean": 3.3830065727233887, + "rewards/accuracy_reward/std": 3.6336708068847656, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1008.0, + "completions/max_terminated_length": 1008.0, + "completions/mean_length": 582.46875, + "completions/mean_terminated_length": 582.46875, + "completions/min_length": 357.0, + "completions/min_terminated_length": 357.0, + "epoch": 0.09244712990936556, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.036255452781915665, + "learning_rate": 2.987185303145541e-06, + "loss": -0.0149, + "num_tokens": 26538028.0, + "reward": 6.886981010437012, + "reward_std": 2.0785937309265137, + "rewards/accuracy_reward/mean": 6.13698148727417, + "rewards/accuracy_reward/std": 2.8187286853790283, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 886.0, + "completions/max_terminated_length": 886.0, + "completions/mean_length": 534.90625, + "completions/mean_terminated_length": 534.90625, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, + "epoch": 0.09305135951661632, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03110470622777939, + "learning_rate": 2.986811781388378e-06, + "loss": 0.0137, + "num_tokens": 26722550.0, + "reward": 5.839145183563232, + "reward_std": 1.2854399681091309, + "rewards/accuracy_reward/mean": 5.089144706726074, + "rewards/accuracy_reward/std": 3.2934350967407227, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 975.0, + "completions/max_terminated_length": 975.0, + "completions/mean_length": 677.71875, + "completions/mean_terminated_length": 677.71875, + "completions/min_length": 390.0, + "completions/min_terminated_length": 390.0, + "epoch": 0.09365558912386707, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.056508537381887436, + "learning_rate": 2.9864329205714557e-06, + "loss": 0.0687, + "num_tokens": 26878292.0, + "reward": 5.548691272735596, + "reward_std": 3.222064256668091, + "rewards/accuracy_reward/mean": 4.798691272735596, + "rewards/accuracy_reward/std": 3.596031427383423, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1019.0, + "completions/max_terminated_length": 1019.0, + "completions/mean_length": 612.34375, + "completions/mean_terminated_length": 612.34375, + "completions/min_length": 384.0, + "completions/min_terminated_length": 384.0, + "epoch": 0.09425981873111783, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03308256343007088, + "learning_rate": 2.986048722207899e-06, + "loss": -0.0006, + "num_tokens": 27060026.0, + "reward": 7.597167015075684, + "reward_std": 1.3084590435028076, + "rewards/accuracy_reward/mean": 6.847167015075684, + "rewards/accuracy_reward/std": 2.009552478790283, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 817.0, + "completions/max_terminated_length": 817.0, + "completions/mean_length": 512.3125, + "completions/mean_terminated_length": 512.3125, + "completions/min_length": 276.0, + "completions/min_terminated_length": 276.0, + "epoch": 0.09486404833836858, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03801378980278969, + "learning_rate": 2.9856591878321463e-06, + "loss": -0.0123, + "num_tokens": 27216062.0, + "reward": 5.401611328125, + "reward_std": 1.9578251838684082, + "rewards/accuracy_reward/mean": 4.651611328125, + "rewards/accuracy_reward/std": 3.588651180267334, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 718.0, + "completions/max_terminated_length": 718.0, + "completions/mean_length": 510.96875, + "completions/mean_terminated_length": 510.96875, + "completions/min_length": 332.0, + "completions/min_terminated_length": 332.0, + "epoch": 0.09546827794561934, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03938839212059975, + "learning_rate": 2.9852643189999507e-06, + "loss": 0.0616, + "num_tokens": 27432396.0, + "reward": 4.103596210479736, + "reward_std": 2.38344144821167, + "rewards/accuracy_reward/mean": 3.3535959720611572, + "rewards/accuracy_reward/std": 3.714130401611328, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 786.0, + "completions/max_terminated_length": 786.0, + "completions/mean_length": 501.828125, + "completions/mean_terminated_length": 501.828125, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "epoch": 0.09607250755287008, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03804798424243927, + "learning_rate": 2.9848641172883696e-06, + "loss": -0.0261, + "num_tokens": 27591265.0, + "reward": 4.925178050994873, + "reward_std": 1.7089773416519165, + "rewards/accuracy_reward/mean": 4.175178050994873, + "rewards/accuracy_reward/std": 3.711771249771118, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 780.0, + "completions/max_terminated_length": 780.0, + "completions/mean_length": 565.90625, + "completions/mean_terminated_length": 565.90625, + "completions/min_length": 385.0, + "completions/min_terminated_length": 385.0, + "epoch": 0.09667673716012085, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.017457257956266403, + "learning_rate": 2.984458584295757e-06, + "loss": -0.0067, + "num_tokens": 27744315.0, + "reward": 6.0564985275268555, + "reward_std": 0.9701351523399353, + "rewards/accuracy_reward/mean": 5.3064985275268555, + "rewards/accuracy_reward/std": 3.3694186210632324, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1089.0, + "completions/max_terminated_length": 1089.0, + "completions/mean_length": 598.078125, + "completions/mean_terminated_length": 598.078125, + "completions/min_length": 312.0, + "completions/min_terminated_length": 312.0, + "epoch": 0.09728096676737161, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04373330995440483, + "learning_rate": 2.984047721641763e-06, + "loss": 0.0175, + "num_tokens": 27910464.0, + "reward": 5.1542768478393555, + "reward_std": 2.2781033515930176, + "rewards/accuracy_reward/mean": 4.4042768478393555, + "rewards/accuracy_reward/std": 3.690969944000244, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 838.0, + "completions/max_terminated_length": 838.0, + "completions/mean_length": 633.109375, + "completions/mean_terminated_length": 633.109375, + "completions/min_length": 420.0, + "completions/min_terminated_length": 420.0, + "epoch": 0.09788519637462235, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.028211276978254318, + "learning_rate": 2.9836315309673204e-06, + "loss": 0.0162, + "num_tokens": 28236039.0, + "reward": 3.8583388328552246, + "reward_std": 0.9515811800956726, + "rewards/accuracy_reward/mean": 3.1083390712738037, + "rewards/accuracy_reward/std": 3.701545238494873, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 988.0, + "completions/max_terminated_length": 988.0, + "completions/mean_length": 559.578125, + "completions/mean_terminated_length": 559.578125, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "epoch": 0.09848942598187312, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.031100234016776085, + "learning_rate": 2.9832100139346436e-06, + "loss": -0.0067, + "num_tokens": 28363692.0, + "reward": 6.122005462646484, + "reward_std": 1.1299099922180176, + "rewards/accuracy_reward/mean": 5.379818439483643, + "rewards/accuracy_reward/std": 3.2641184329986572, + "rewards/tag_count_reward/mean": 0.7421875, + "rewards/tag_count_reward/std": 0.043842025101184845, + "step": 163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 863.0, + "completions/max_terminated_length": 863.0, + "completions/mean_length": 601.96875, + "completions/mean_terminated_length": 601.96875, + "completions/min_length": 406.0, + "completions/min_terminated_length": 406.0, + "epoch": 0.09909365558912386, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.047658126801252365, + "learning_rate": 2.9827831722272195e-06, + "loss": -0.0062, + "num_tokens": 28544570.0, + "reward": 6.783688545227051, + "reward_std": 2.9128644466400146, + "rewards/accuracy_reward/mean": 6.033688545227051, + "rewards/accuracy_reward/std": 2.9300737380981445, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1248.0, + "completions/max_terminated_length": 1248.0, + "completions/mean_length": 643.34375, + "completions/mean_terminated_length": 643.34375, + "completions/min_length": 392.0, + "completions/min_terminated_length": 392.0, + "epoch": 0.09969788519637462, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.041478920727968216, + "learning_rate": 2.9823510075498005e-06, + "loss": 0.0148, + "num_tokens": 28695696.0, + "reward": 4.867195129394531, + "reward_std": 2.0420801639556885, + "rewards/accuracy_reward/mean": 4.117195129394531, + "rewards/accuracy_reward/std": 3.681602954864502, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 834.0, + "completions/max_terminated_length": 834.0, + "completions/mean_length": 529.640625, + "completions/mean_terminated_length": 529.640625, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "epoch": 0.10030211480362537, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04907475784420967, + "learning_rate": 2.9819135216283977e-06, + "loss": 0.008, + "num_tokens": 28870121.0, + "reward": 2.828136920928955, + "reward_std": 2.3642661571502686, + "rewards/accuracy_reward/mean": 2.078137159347534, + "rewards/accuracy_reward/std": 3.348461151123047, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 826.0, + "completions/max_terminated_length": 826.0, + "completions/mean_length": 541.296875, + "completions/mean_terminated_length": 541.296875, + "completions/min_length": 347.0, + "completions/min_terminated_length": 347.0, + "epoch": 0.10090634441087613, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05336647480726242, + "learning_rate": 2.981470716210276e-06, + "loss": -0.0001, + "num_tokens": 29035196.0, + "reward": 5.844951152801514, + "reward_std": 3.184624195098877, + "rewards/accuracy_reward/mean": 5.094951152801514, + "rewards/accuracy_reward/std": 3.4508750438690186, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1231.0, + "completions/max_terminated_length": 1231.0, + "completions/mean_length": 582.875, + "completions/mean_terminated_length": 582.875, + "completions/min_length": 305.0, + "completions/min_terminated_length": 305.0, + "epoch": 0.1015105740181269, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04177451506257057, + "learning_rate": 2.981022593063946e-06, + "loss": 0.0343, + "num_tokens": 29198836.0, + "reward": 6.027894973754883, + "reward_std": 1.7318048477172852, + "rewards/accuracy_reward/mean": 5.277894496917725, + "rewards/accuracy_reward/std": 3.2910361289978027, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 883.0, + "completions/mean_length": 637.390625, + "completions/mean_terminated_length": 615.0000610351562, + "completions/min_length": 301.0, + "completions/min_terminated_length": 301.0, + "epoch": 0.10211480362537764, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03551078215241432, + "learning_rate": 2.9805691539791537e-06, + "loss": -0.0021, + "num_tokens": 29356445.0, + "reward": 4.022823333740234, + "reward_std": 1.4553248882293701, + "rewards/accuracy_reward/mean": 3.2884488105773926, + "rewards/accuracy_reward/std": 3.67754864692688, + "rewards/tag_count_reward/mean": 0.734375, + "rewards/tag_count_reward/std": 0.09834947437047958, + "step": 169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 908.0, + "completions/max_terminated_length": 908.0, + "completions/mean_length": 575.765625, + "completions/mean_terminated_length": 575.765625, + "completions/min_length": 388.0, + "completions/min_terminated_length": 388.0, + "epoch": 0.1027190332326284, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.030138906091451645, + "learning_rate": 2.9801104007668796e-06, + "loss": -0.0017, + "num_tokens": 29482526.0, + "reward": 2.533665418624878, + "reward_std": 1.1923483610153198, + "rewards/accuracy_reward/mean": 1.7875715494155884, + "rewards/accuracy_reward/std": 3.4444026947021484, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1351.0, + "completions/mean_length": 665.8125, + "completions/mean_terminated_length": 643.873046875, + "completions/min_length": 313.0, + "completions/min_terminated_length": 313.0, + "epoch": 0.10332326283987915, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.018481051549315453, + "learning_rate": 2.9796463352593275e-06, + "loss": -0.0344, + "num_tokens": 29667682.0, + "reward": 4.158517360687256, + "reward_std": 0.8212066888809204, + "rewards/accuracy_reward/mean": 3.4202358722686768, + "rewards/accuracy_reward/std": 3.7295029163360596, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 852.0, + "completions/max_terminated_length": 852.0, + "completions/mean_length": 572.75, + "completions/mean_terminated_length": 572.75, + "completions/min_length": 353.0, + "completions/min_terminated_length": 353.0, + "epoch": 0.10392749244712991, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04474496841430664, + "learning_rate": 2.979176959309916e-06, + "loss": -0.0017, + "num_tokens": 29854114.0, + "reward": 4.017061233520508, + "reward_std": 1.9013409614562988, + "rewards/accuracy_reward/mean": 3.2670609951019287, + "rewards/accuracy_reward/std": 3.673776149749756, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 757.0, + "completions/max_terminated_length": 757.0, + "completions/mean_length": 500.984375, + "completions/mean_terminated_length": 500.984375, + "completions/min_length": 332.0, + "completions/min_terminated_length": 332.0, + "epoch": 0.10453172205438066, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.02617737278342247, + "learning_rate": 2.9787022747932747e-06, + "loss": -0.0086, + "num_tokens": 30027489.0, + "reward": 4.594332218170166, + "reward_std": 0.7681067585945129, + "rewards/accuracy_reward/mean": 3.844331741333008, + "rewards/accuracy_reward/std": 3.9041366577148438, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 906.0, + "completions/max_terminated_length": 906.0, + "completions/mean_length": 585.3125, + "completions/mean_terminated_length": 585.3125, + "completions/min_length": 368.0, + "completions/min_terminated_length": 368.0, + "epoch": 0.10513595166163142, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05533432587981224, + "learning_rate": 2.978222283605234e-06, + "loss": -0.0043, + "num_tokens": 30188101.0, + "reward": 5.581939697265625, + "reward_std": 3.074195146560669, + "rewards/accuracy_reward/mean": 4.831939697265625, + "rewards/accuracy_reward/std": 3.6301331520080566, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 990.0, + "completions/max_terminated_length": 990.0, + "completions/mean_length": 673.46875, + "completions/mean_terminated_length": 673.46875, + "completions/min_length": 432.0, + "completions/min_terminated_length": 432.0, + "epoch": 0.10574018126888217, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04487411305308342, + "learning_rate": 2.9777369876628197e-06, + "loss": -0.001, + "num_tokens": 30352051.0, + "reward": 5.06648588180542, + "reward_std": 1.8553681373596191, + "rewards/accuracy_reward/mean": 4.316486358642578, + "rewards/accuracy_reward/std": 3.656301975250244, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 923.0, + "completions/max_terminated_length": 923.0, + "completions/mean_length": 560.828125, + "completions/mean_terminated_length": 560.828125, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, + "epoch": 0.10634441087613293, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04190770164132118, + "learning_rate": 2.977246388904243e-06, + "loss": -0.005, + "num_tokens": 30579256.0, + "reward": 6.885400772094727, + "reward_std": 1.654840111732483, + "rewards/accuracy_reward/mean": 6.135400772094727, + "rewards/accuracy_reward/std": 2.711042881011963, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 789.0, + "completions/max_terminated_length": 789.0, + "completions/mean_length": 536.703125, + "completions/mean_terminated_length": 536.703125, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, + "epoch": 0.10694864048338369, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03939886391162872, + "learning_rate": 2.9767504892888945e-06, + "loss": 0.0077, + "num_tokens": 30733141.0, + "reward": 2.9799230098724365, + "reward_std": 1.60249924659729, + "rewards/accuracy_reward/mean": 2.2299230098724365, + "rewards/accuracy_reward/std": 3.3179121017456055, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 793.0, + "completions/max_terminated_length": 793.0, + "completions/mean_length": 536.375, + "completions/mean_terminated_length": 536.375, + "completions/min_length": 348.0, + "completions/min_terminated_length": 348.0, + "epoch": 0.10755287009063444, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04744384065270424, + "learning_rate": 2.9762492907973344e-06, + "loss": 0.0162, + "num_tokens": 30927133.0, + "reward": 4.570286750793457, + "reward_std": 2.5091118812561035, + "rewards/accuracy_reward/mean": 3.824193000793457, + "rewards/accuracy_reward/std": 3.6675126552581787, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1119.0, + "completions/max_terminated_length": 1119.0, + "completions/mean_length": 573.671875, + "completions/mean_terminated_length": 573.671875, + "completions/min_length": 319.0, + "completions/min_terminated_length": 319.0, + "epoch": 0.1081570996978852, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.02367950603365898, + "learning_rate": 2.975742795431288e-06, + "loss": -0.006, + "num_tokens": 31062328.0, + "reward": 4.863864898681641, + "reward_std": 0.9326555728912354, + "rewards/accuracy_reward/mean": 4.113864421844482, + "rewards/accuracy_reward/std": 3.6435093879699707, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 904.0, + "completions/mean_length": 643.234375, + "completions/mean_terminated_length": 620.9365234375, + "completions/min_length": 425.0, + "completions/min_terminated_length": 425.0, + "epoch": 0.10876132930513595, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03243964910507202, + "learning_rate": 2.9752310052136353e-06, + "loss": -0.0176, + "num_tokens": 31223351.0, + "reward": 2.396878242492676, + "reward_std": 1.4429471492767334, + "rewards/accuracy_reward/mean": 1.6585968732833862, + "rewards/accuracy_reward/std": 2.95205020904541, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1259.0, + "completions/max_terminated_length": 1259.0, + "completions/mean_length": 695.421875, + "completions/mean_terminated_length": 695.421875, + "completions/min_length": 368.0, + "completions/min_terminated_length": 368.0, + "epoch": 0.10936555891238671, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.042229264974594116, + "learning_rate": 2.9747139221884013e-06, + "loss": -0.0273, + "num_tokens": 31379874.0, + "reward": 2.6096439361572266, + "reward_std": 2.3105030059814453, + "rewards/accuracy_reward/mean": 1.8596439361572266, + "rewards/accuracy_reward/std": 3.116105556488037, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1067.0, + "completions/max_terminated_length": 1067.0, + "completions/mean_length": 628.890625, + "completions/mean_terminated_length": 628.890625, + "completions/min_length": 435.0, + "completions/min_terminated_length": 435.0, + "epoch": 0.10996978851963746, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03919349983334541, + "learning_rate": 2.9741915484207523e-06, + "loss": 0.0068, + "num_tokens": 31516363.0, + "reward": 7.38333797454834, + "reward_std": 1.9760456085205078, + "rewards/accuracy_reward/mean": 6.63333797454834, + "rewards/accuracy_reward/std": 2.3431828022003174, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1515.0, + "completions/max_terminated_length": 1515.0, + "completions/mean_length": 779.625, + "completions/mean_terminated_length": 779.625, + "completions/min_length": 329.0, + "completions/min_terminated_length": 329.0, + "epoch": 0.11057401812688822, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.029253434389829636, + "learning_rate": 2.9736638859969834e-06, + "loss": -0.0102, + "num_tokens": 31678755.0, + "reward": 2.565757989883423, + "reward_std": 1.6946951150894165, + "rewards/accuracy_reward/mean": 1.8157579898834229, + "rewards/accuracy_reward/std": 2.9538414478302, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 874.0, + "completions/max_terminated_length": 874.0, + "completions/mean_length": 544.53125, + "completions/mean_terminated_length": 544.53125, + "completions/min_length": 351.0, + "completions/min_terminated_length": 351.0, + "epoch": 0.11117824773413898, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04502902179956436, + "learning_rate": 2.9731309370245134e-06, + "loss": 0.0117, + "num_tokens": 31798773.0, + "reward": 5.279115676879883, + "reward_std": 2.0504157543182373, + "rewards/accuracy_reward/mean": 4.529115676879883, + "rewards/accuracy_reward/std": 3.6406078338623047, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1245.0, + "completions/max_terminated_length": 1245.0, + "completions/mean_length": 652.234375, + "completions/mean_terminated_length": 652.234375, + "completions/min_length": 420.0, + "completions/min_terminated_length": 420.0, + "epoch": 0.11178247734138973, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.024879176169633865, + "learning_rate": 2.972592703631872e-06, + "loss": 0.0015, + "num_tokens": 31964836.0, + "reward": 3.883316993713379, + "reward_std": 1.1029592752456665, + "rewards/accuracy_reward/mean": 3.133317470550537, + "rewards/accuracy_reward/std": 3.721062421798706, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1188.0, + "completions/max_terminated_length": 1188.0, + "completions/mean_length": 627.15625, + "completions/mean_terminated_length": 627.15625, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 0.11238670694864049, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04818617179989815, + "learning_rate": 2.9720491879686994e-06, + "loss": 0.0119, + "num_tokens": 32142078.0, + "reward": 3.656161308288574, + "reward_std": 2.130336284637451, + "rewards/accuracy_reward/mean": 2.906161308288574, + "rewards/accuracy_reward/std": 3.4181158542633057, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 794.0, + "completions/max_terminated_length": 794.0, + "completions/mean_length": 541.671875, + "completions/mean_terminated_length": 541.671875, + "completions/min_length": 359.0, + "completions/min_terminated_length": 359.0, + "epoch": 0.11299093655589124, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.012896990403532982, + "learning_rate": 2.9715003922057274e-06, + "loss": 0.0039, + "num_tokens": 32302857.0, + "reward": 4.31449031829834, + "reward_std": 0.4776504337787628, + "rewards/accuracy_reward/mean": 3.56449031829834, + "rewards/accuracy_reward/std": 3.7072644233703613, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 967.0, + "completions/max_terminated_length": 967.0, + "completions/mean_length": 578.703125, + "completions/mean_terminated_length": 578.703125, + "completions/min_length": 328.0, + "completions/min_terminated_length": 328.0, + "epoch": 0.113595166163142, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.026343775913119316, + "learning_rate": 2.970946318534779e-06, + "loss": 0.0009, + "num_tokens": 32441670.0, + "reward": 4.108141899108887, + "reward_std": 0.9442707300186157, + "rewards/accuracy_reward/mean": 3.3620481491088867, + "rewards/accuracy_reward/std": 3.705714464187622, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 798.0, + "completions/max_terminated_length": 798.0, + "completions/mean_length": 458.625, + "completions/mean_terminated_length": 458.625, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, + "epoch": 0.11419939577039274, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03570733591914177, + "learning_rate": 2.970386969168754e-06, + "loss": -0.0027, + "num_tokens": 32552254.0, + "reward": 5.121797561645508, + "reward_std": 1.5537062883377075, + "rewards/accuracy_reward/mean": 4.371797561645508, + "rewards/accuracy_reward/std": 3.509557008743286, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1395.0, + "completions/mean_length": 645.328125, + "completions/mean_terminated_length": 623.0635375976562, + "completions/min_length": 412.0, + "completions/min_terminated_length": 412.0, + "epoch": 0.1148036253776435, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.030932540073990822, + "learning_rate": 2.9698223463416256e-06, + "loss": 0.0019, + "num_tokens": 32722435.0, + "reward": 6.424796104431152, + "reward_std": 1.1542396545410156, + "rewards/accuracy_reward/mean": 5.686514854431152, + "rewards/accuracy_reward/std": 3.180075168609619, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 831.0, + "completions/max_terminated_length": 831.0, + "completions/mean_length": 599.1875, + "completions/mean_terminated_length": 599.1875, + "completions/min_length": 443.0, + "completions/min_terminated_length": 443.0, + "epoch": 0.11540785498489425, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04775106906890869, + "learning_rate": 2.9692524523084263e-06, + "loss": 0.0061, + "num_tokens": 32903327.0, + "reward": 5.063967227935791, + "reward_std": 2.368704319000244, + "rewards/accuracy_reward/mean": 4.313967227935791, + "rewards/accuracy_reward/std": 3.6924564838409424, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 793.0, + "completions/max_terminated_length": 793.0, + "completions/mean_length": 485.875, + "completions/mean_terminated_length": 485.875, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 0.11601208459214502, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.038100190460681915, + "learning_rate": 2.968677289345242e-06, + "loss": 0.0224, + "num_tokens": 33025703.0, + "reward": 6.7899041175842285, + "reward_std": 1.9709413051605225, + "rewards/accuracy_reward/mean": 6.04380989074707, + "rewards/accuracy_reward/std": 2.8702237606048584, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 948.0, + "completions/max_terminated_length": 948.0, + "completions/mean_length": 583.078125, + "completions/mean_terminated_length": 583.078125, + "completions/min_length": 356.0, + "completions/min_terminated_length": 356.0, + "epoch": 0.11661631419939578, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04310370609164238, + "learning_rate": 2.968096859749202e-06, + "loss": 0.0247, + "num_tokens": 33169660.0, + "reward": 4.316083908081055, + "reward_std": 2.2810261249542236, + "rewards/accuracy_reward/mean": 3.5660839080810547, + "rewards/accuracy_reward/std": 3.6256661415100098, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1089.0, + "completions/max_terminated_length": 1089.0, + "completions/mean_length": 588.765625, + "completions/mean_terminated_length": 588.765625, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "epoch": 0.11722054380664652, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.01270938292145729, + "learning_rate": 2.96751116583847e-06, + "loss": -0.0001, + "num_tokens": 33348061.0, + "reward": 1.0474390983581543, + "reward_std": 0.4942365288734436, + "rewards/accuracy_reward/mean": 0.2974390387535095, + "rewards/accuracy_reward/std": 0.942270040512085, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1144.0, + "completions/max_terminated_length": 1144.0, + "completions/mean_length": 631.15625, + "completions/mean_terminated_length": 631.15625, + "completions/min_length": 455.0, + "completions/min_terminated_length": 455.0, + "epoch": 0.11782477341389729, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.012887722812592983, + "learning_rate": 2.9669202099522343e-06, + "loss": -0.0029, + "num_tokens": 33495351.0, + "reward": 4.451267719268799, + "reward_std": 0.5255235433578491, + "rewards/accuracy_reward/mean": 3.701268196105957, + "rewards/accuracy_reward/std": 3.666463613510132, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 877.0, + "completions/max_terminated_length": 877.0, + "completions/mean_length": 523.515625, + "completions/mean_terminated_length": 523.515625, + "completions/min_length": 328.0, + "completions/min_terminated_length": 328.0, + "epoch": 0.11842900302114803, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03175343573093414, + "learning_rate": 2.966323994450699e-06, + "loss": -0.0153, + "num_tokens": 33640776.0, + "reward": 4.5680317878723145, + "reward_std": 1.5852816104888916, + "rewards/accuracy_reward/mean": 3.8180317878723145, + "rewards/accuracy_reward/std": 3.7304019927978516, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 768.0, + "completions/max_terminated_length": 768.0, + "completions/mean_length": 521.671875, + "completions/mean_terminated_length": 521.671875, + "completions/min_length": 335.0, + "completions/min_terminated_length": 335.0, + "epoch": 0.1190332326283988, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.053801316767930984, + "learning_rate": 2.9657225217150746e-06, + "loss": 0.061, + "num_tokens": 33784051.0, + "reward": 3.688002586364746, + "reward_std": 3.3793344497680664, + "rewards/accuracy_reward/mean": 2.938002586364746, + "rewards/accuracy_reward/std": 3.705449342727661, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 911.0, + "completions/max_terminated_length": 911.0, + "completions/mean_length": 555.046875, + "completions/mean_terminated_length": 555.046875, + "completions/min_length": 356.0, + "completions/min_terminated_length": 356.0, + "epoch": 0.11963746223564954, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0518251471221447, + "learning_rate": 2.9651157941475685e-06, + "loss": -0.0066, + "num_tokens": 33935782.0, + "reward": 5.625888824462891, + "reward_std": 2.9509384632110596, + "rewards/accuracy_reward/mean": 4.875888824462891, + "rewards/accuracy_reward/std": 3.5264320373535156, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 871.0, + "completions/max_terminated_length": 871.0, + "completions/mean_length": 553.359375, + "completions/mean_terminated_length": 553.359375, + "completions/min_length": 277.0, + "completions/min_terminated_length": 277.0, + "epoch": 0.1202416918429003, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03579654172062874, + "learning_rate": 2.964503814171375e-06, + "loss": -0.0082, + "num_tokens": 34076413.0, + "reward": 7.407468795776367, + "reward_std": 1.8732445240020752, + "rewards/accuracy_reward/mean": 6.657468795776367, + "rewards/accuracy_reward/std": 2.2388381958007812, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1081.0, + "completions/max_terminated_length": 1081.0, + "completions/mean_length": 488.125, + "completions/mean_terminated_length": 488.125, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "epoch": 0.12084592145015106, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.001330305589362979, + "learning_rate": 2.9638865842306654e-06, + "loss": -0.001, + "num_tokens": 34226389.0, + "reward": 6.241544723510742, + "reward_std": 0.06457770615816116, + "rewards/accuracy_reward/mean": 5.491544723510742, + "rewards/accuracy_reward/std": 3.1994805335998535, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 929.0, + "completions/max_terminated_length": 929.0, + "completions/mean_length": 651.296875, + "completions/mean_terminated_length": 651.296875, + "completions/min_length": 472.0, + "completions/min_terminated_length": 472.0, + "epoch": 0.12145015105740181, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03955275937914848, + "learning_rate": 2.96326410679058e-06, + "loss": -0.0068, + "num_tokens": 34385944.0, + "reward": 3.443657875061035, + "reward_std": 1.494640588760376, + "rewards/accuracy_reward/mean": 2.693657875061035, + "rewards/accuracy_reward/std": 3.494710683822632, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1005.0, + "completions/max_terminated_length": 1005.0, + "completions/mean_length": 599.59375, + "completions/mean_terminated_length": 599.59375, + "completions/min_length": 348.0, + "completions/min_terminated_length": 348.0, + "epoch": 0.12205438066465257, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03003774769604206, + "learning_rate": 2.962636384337216e-06, + "loss": -0.0027, + "num_tokens": 34533742.0, + "reward": 3.962493419647217, + "reward_std": 0.9422852993011475, + "rewards/accuracy_reward/mean": 3.212493658065796, + "rewards/accuracy_reward/std": 3.6081860065460205, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 908.0, + "completions/max_terminated_length": 908.0, + "completions/mean_length": 551.65625, + "completions/mean_terminated_length": 551.65625, + "completions/min_length": 345.0, + "completions/min_terminated_length": 345.0, + "epoch": 0.12265861027190332, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.03575572744011879, + "learning_rate": 2.9620034193776187e-06, + "loss": 0.0156, + "num_tokens": 34683016.0, + "reward": 3.8212876319885254, + "reward_std": 0.9443848729133606, + "rewards/accuracy_reward/mean": 3.0712873935699463, + "rewards/accuracy_reward/std": 3.6591830253601074, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 591.0, + "completions/max_terminated_length": 591.0, + "completions/mean_length": 440.8125, + "completions/mean_terminated_length": 440.8125, + "completions/min_length": 299.0, + "completions/min_terminated_length": 299.0, + "epoch": 0.12326283987915408, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.049208831042051315, + "learning_rate": 2.9613652144397706e-06, + "loss": 0.0192, + "num_tokens": 34895996.0, + "reward": 5.064154624938965, + "reward_std": 2.643822193145752, + "rewards/accuracy_reward/mean": 4.314154624938965, + "rewards/accuracy_reward/std": 3.571460723876953, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1037.0, + "completions/mean_length": 622.40625, + "completions/mean_terminated_length": 599.77783203125, + "completions/min_length": 305.0, + "completions/min_terminated_length": 305.0, + "epoch": 0.12386706948640483, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05079617723822594, + "learning_rate": 2.9607217720725836e-06, + "loss": 0.0001, + "num_tokens": 35059622.0, + "reward": 4.269966125488281, + "reward_std": 2.9154129028320312, + "rewards/accuracy_reward/mean": 3.5316851139068604, + "rewards/accuracy_reward/std": 3.7085416316986084, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 925.0, + "completions/max_terminated_length": 925.0, + "completions/mean_length": 646.265625, + "completions/mean_terminated_length": 646.265625, + "completions/min_length": 372.0, + "completions/min_terminated_length": 372.0, + "epoch": 0.12447129909365559, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.030898567289114, + "learning_rate": 2.9600730948458863e-06, + "loss": 0.0015, + "num_tokens": 35271575.0, + "reward": 1.2115507125854492, + "reward_std": 1.2953379154205322, + "rewards/accuracy_reward/mean": 0.4615507125854492, + "rewards/accuracy_reward/std": 1.816009759902954, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1006.0, + "completions/max_terminated_length": 1006.0, + "completions/mean_length": 570.390625, + "completions/mean_terminated_length": 570.390625, + "completions/min_length": 326.0, + "completions/min_terminated_length": 326.0, + "epoch": 0.12507552870090635, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04192858561873436, + "learning_rate": 2.9594191853504137e-06, + "loss": 0.0161, + "num_tokens": 35491744.0, + "reward": 3.8250627517700195, + "reward_std": 1.8091981410980225, + "rewards/accuracy_reward/mean": 3.0750627517700195, + "rewards/accuracy_reward/std": 3.7301464080810547, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1935.0, + "completions/max_terminated_length": 1935.0, + "completions/mean_length": 727.15625, + "completions/mean_terminated_length": 727.15625, + "completions/min_length": 376.0, + "completions/min_terminated_length": 376.0, + "epoch": 0.1256797583081571, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03865790367126465, + "learning_rate": 2.9587600461978e-06, + "loss": 0.0019, + "num_tokens": 35676746.0, + "reward": 3.200573205947876, + "reward_std": 1.8555512428283691, + "rewards/accuracy_reward/mean": 2.450573444366455, + "rewards/accuracy_reward/std": 3.5232386589050293, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1555.0, + "completions/max_terminated_length": 1555.0, + "completions/mean_length": 715.046875, + "completions/mean_terminated_length": 715.046875, + "completions/min_length": 358.0, + "completions/min_terminated_length": 358.0, + "epoch": 0.12628398791540785, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.02623041532933712, + "learning_rate": 2.958095680020565e-06, + "loss": -0.0037, + "num_tokens": 35898061.0, + "reward": 2.146787405014038, + "reward_std": 0.9097850918769836, + "rewards/accuracy_reward/mean": 1.3967875242233276, + "rewards/accuracy_reward/std": 2.9466328620910645, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1549.0, + "completions/max_terminated_length": 1549.0, + "completions/mean_length": 581.8125, + "completions/mean_terminated_length": 581.8125, + "completions/min_length": 269.0, + "completions/min_terminated_length": 269.0, + "epoch": 0.1268882175226586, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04409867525100708, + "learning_rate": 2.957426089472103e-06, + "loss": -0.0189, + "num_tokens": 36062065.0, + "reward": 5.447445392608643, + "reward_std": 2.1203970909118652, + "rewards/accuracy_reward/mean": 4.697445392608643, + "rewards/accuracy_reward/std": 3.6011736392974854, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 934.0, + "completions/max_terminated_length": 934.0, + "completions/mean_length": 582.265625, + "completions/mean_terminated_length": 582.265625, + "completions/min_length": 271.0, + "completions/min_terminated_length": 271.0, + "epoch": 0.12749244712990937, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.01667940244078636, + "learning_rate": 2.9567512772266774e-06, + "loss": 0.0071, + "num_tokens": 36226146.0, + "reward": 4.21352481842041, + "reward_std": 0.6418384909629822, + "rewards/accuracy_reward/mean": 3.46352481842041, + "rewards/accuracy_reward/std": 3.7108747959136963, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 869.0, + "completions/max_terminated_length": 869.0, + "completions/mean_length": 571.375, + "completions/mean_terminated_length": 571.375, + "completions/min_length": 372.0, + "completions/min_terminated_length": 372.0, + "epoch": 0.12809667673716013, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.018495850265026093, + "learning_rate": 2.9560712459794023e-06, + "loss": -0.0349, + "num_tokens": 36362090.0, + "reward": 6.000076770782471, + "reward_std": 0.7318800687789917, + "rewards/accuracy_reward/mean": 5.250076770782471, + "rewards/accuracy_reward/std": 3.3938655853271484, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 775.0, + "completions/max_terminated_length": 775.0, + "completions/mean_length": 532.59375, + "completions/mean_terminated_length": 532.59375, + "completions/min_length": 346.0, + "completions/min_terminated_length": 346.0, + "epoch": 0.12870090634441086, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04726618155837059, + "learning_rate": 2.9553859984462393e-06, + "loss": 0.0138, + "num_tokens": 36517408.0, + "reward": 4.5626654624938965, + "reward_std": 2.7037246227264404, + "rewards/accuracy_reward/mean": 3.8126654624938965, + "rewards/accuracy_reward/std": 3.7584309577941895, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1340.0, + "completions/max_terminated_length": 1340.0, + "completions/mean_length": 614.578125, + "completions/mean_terminated_length": 614.578125, + "completions/min_length": 393.0, + "completions/min_terminated_length": 393.0, + "epoch": 0.12930513595166163, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.048972684890031815, + "learning_rate": 2.9546955373639803e-06, + "loss": 0.0395, + "num_tokens": 36666549.0, + "reward": 3.094533920288086, + "reward_std": 2.389467716217041, + "rewards/accuracy_reward/mean": 2.344533920288086, + "rewards/accuracy_reward/std": 3.4678895473480225, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1237.0, + "completions/max_terminated_length": 1237.0, + "completions/mean_length": 627.9375, + "completions/mean_terminated_length": 627.9375, + "completions/min_length": 390.0, + "completions/min_terminated_length": 390.0, + "epoch": 0.1299093655589124, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.015013652853667736, + "learning_rate": 2.953999865490242e-06, + "loss": -0.0004, + "num_tokens": 36819665.0, + "reward": 0.9333359599113464, + "reward_std": 0.5123171806335449, + "rewards/accuracy_reward/mean": 0.18333593010902405, + "rewards/accuracy_reward/std": 0.9191484451293945, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1104.0, + "completions/max_terminated_length": 1104.0, + "completions/mean_length": 628.515625, + "completions/mean_terminated_length": 628.515625, + "completions/min_length": 348.0, + "completions/min_terminated_length": 348.0, + "epoch": 0.13051359516616315, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.015647729858756065, + "learning_rate": 2.9532989856034515e-06, + "loss": -0.003, + "num_tokens": 36979266.0, + "reward": 4.349404335021973, + "reward_std": 0.6615608930587769, + "rewards/accuracy_reward/mean": 3.5994043350219727, + "rewards/accuracy_reward/std": 3.738983154296875, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 938.0, + "completions/max_terminated_length": 938.0, + "completions/mean_length": 553.046875, + "completions/mean_terminated_length": 553.046875, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "epoch": 0.1311178247734139, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.028615295886993408, + "learning_rate": 2.9525929005028343e-06, + "loss": 0.0097, + "num_tokens": 37120293.0, + "reward": 4.846029281616211, + "reward_std": 0.890335202217102, + "rewards/accuracy_reward/mean": 4.096029281616211, + "rewards/accuracy_reward/std": 3.7435965538024902, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 893.0, + "completions/max_terminated_length": 893.0, + "completions/mean_length": 588.703125, + "completions/mean_terminated_length": 588.703125, + "completions/min_length": 335.0, + "completions/min_terminated_length": 335.0, + "epoch": 0.13172205438066464, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.02854621596634388, + "learning_rate": 2.951881613008407e-06, + "loss": 0.0093, + "num_tokens": 37263730.0, + "reward": 3.7388968467712402, + "reward_std": 0.9908595085144043, + "rewards/accuracy_reward/mean": 2.988896608352661, + "rewards/accuracy_reward/std": 3.676496744155884, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 863.0, + "completions/max_terminated_length": 863.0, + "completions/mean_length": 502.53125, + "completions/mean_terminated_length": 502.53125, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "epoch": 0.1323262839879154, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.029887594282627106, + "learning_rate": 2.9511651259609638e-06, + "loss": 0.0138, + "num_tokens": 37395716.0, + "reward": 5.373959541320801, + "reward_std": 1.1285933256149292, + "rewards/accuracy_reward/mean": 4.627865791320801, + "rewards/accuracy_reward/std": 3.6015355587005615, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1198.0, + "completions/max_terminated_length": 1198.0, + "completions/mean_length": 638.171875, + "completions/mean_terminated_length": 638.171875, + "completions/min_length": 418.0, + "completions/min_terminated_length": 418.0, + "epoch": 0.13293051359516617, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.02819198928773403, + "learning_rate": 2.9504434422220645e-06, + "loss": 0.0015, + "num_tokens": 37563519.0, + "reward": 4.311934947967529, + "reward_std": 0.6251699924468994, + "rewards/accuracy_reward/mean": 3.5658411979675293, + "rewards/accuracy_reward/std": 3.8084702491760254, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1464.0, + "completions/max_terminated_length": 1464.0, + "completions/mean_length": 709.828125, + "completions/mean_terminated_length": 709.828125, + "completions/min_length": 313.0, + "completions/min_terminated_length": 313.0, + "epoch": 0.13353474320241693, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.023861490190029144, + "learning_rate": 2.9497165646740238e-06, + "loss": 0.0089, + "num_tokens": 37744180.0, + "reward": 6.135010242462158, + "reward_std": 0.9855507612228394, + "rewards/accuracy_reward/mean": 5.385010242462158, + "rewards/accuracy_reward/std": 3.2931201457977295, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1363.0, + "completions/mean_length": 602.828125, + "completions/mean_terminated_length": 579.888916015625, + "completions/min_length": 293.0, + "completions/min_terminated_length": 293.0, + "epoch": 0.13413897280966766, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03391030803322792, + "learning_rate": 2.9489844962199e-06, + "loss": 0.0014, + "num_tokens": 37874809.0, + "reward": 4.106211185455322, + "reward_std": 1.3405821323394775, + "rewards/accuracy_reward/mean": 3.3679299354553223, + "rewards/accuracy_reward/std": 3.7160542011260986, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 861.625, + "completions/mean_terminated_length": 761.084716796875, + "completions/min_length": 368.0, + "completions/min_terminated_length": 368.0, + "epoch": 0.13474320241691842, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.033382024616003036, + "learning_rate": 2.948247239783484e-06, + "loss": -0.0393, + "num_tokens": 38047937.0, + "reward": 3.799567699432373, + "reward_std": 1.5691064596176147, + "rewards/accuracy_reward/mean": 3.069098949432373, + "rewards/accuracy_reward/std": 3.891141414642334, + "rewards/tag_count_reward/mean": 0.73046875, + "rewards/tag_count_reward/std": 0.06762243062257767, + "step": 223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1162.0, + "completions/max_terminated_length": 1162.0, + "completions/mean_length": 572.828125, + "completions/mean_terminated_length": 572.828125, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 0.13534743202416918, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04106619581580162, + "learning_rate": 2.947504798309285e-06, + "loss": 0.0085, + "num_tokens": 38205222.0, + "reward": 5.337768077850342, + "reward_std": 2.047592878341675, + "rewards/accuracy_reward/mean": 4.587768077850342, + "rewards/accuracy_reward/std": 3.58390474319458, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 961.0, + "completions/max_terminated_length": 961.0, + "completions/mean_length": 570.671875, + "completions/mean_terminated_length": 570.671875, + "completions/min_length": 306.0, + "completions/min_terminated_length": 306.0, + "epoch": 0.13595166163141995, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03582090139389038, + "learning_rate": 2.946757174762523e-06, + "loss": -0.021, + "num_tokens": 38345505.0, + "reward": 2.2629780769348145, + "reward_std": 1.5422399044036865, + "rewards/accuracy_reward/mean": 1.512978196144104, + "rewards/accuracy_reward/std": 3.003129482269287, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 744.0, + "completions/max_terminated_length": 744.0, + "completions/mean_length": 504.765625, + "completions/mean_terminated_length": 504.765625, + "completions/min_length": 309.0, + "completions/min_terminated_length": 309.0, + "epoch": 0.1365558912386707, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.044964034110307693, + "learning_rate": 2.9460043721291133e-06, + "loss": 0.0022, + "num_tokens": 38509618.0, + "reward": 4.424408912658691, + "reward_std": 2.8834095001220703, + "rewards/accuracy_reward/mean": 3.6744093894958496, + "rewards/accuracy_reward/std": 3.770592212677002, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1192.0, + "completions/max_terminated_length": 1192.0, + "completions/mean_length": 566.65625, + "completions/mean_terminated_length": 566.65625, + "completions/min_length": 285.0, + "completions/min_terminated_length": 285.0, + "epoch": 0.13716012084592144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04507065191864967, + "learning_rate": 2.945246393415654e-06, + "loss": 0.0188, + "num_tokens": 38676396.0, + "reward": 6.426874160766602, + "reward_std": 2.705090045928955, + "rewards/accuracy_reward/mean": 5.676874160766602, + "rewards/accuracy_reward/std": 3.060929298400879, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 935.0, + "completions/max_terminated_length": 935.0, + "completions/mean_length": 495.453125, + "completions/mean_terminated_length": 495.453125, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, + "epoch": 0.1377643504531722, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.02278229221701622, + "learning_rate": 2.94448324164942e-06, + "loss": 0.0003, + "num_tokens": 38824937.0, + "reward": 6.571817398071289, + "reward_std": 0.8215646147727966, + "rewards/accuracy_reward/mean": 5.821817398071289, + "rewards/accuracy_reward/std": 2.966156244277954, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1323.0, + "completions/max_terminated_length": 1323.0, + "completions/mean_length": 642.375, + "completions/mean_terminated_length": 642.375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "epoch": 0.13836858006042296, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.034994352608919144, + "learning_rate": 2.9437149198783434e-06, + "loss": 0.022, + "num_tokens": 38991377.0, + "reward": 3.9082422256469727, + "reward_std": 1.0099338293075562, + "rewards/accuracy_reward/mean": 3.1582422256469727, + "rewards/accuracy_reward/std": 3.6639366149902344, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1266.0, + "completions/mean_length": 582.828125, + "completions/mean_terminated_length": 559.5714721679688, + "completions/min_length": 290.0, + "completions/min_terminated_length": 290.0, + "epoch": 0.13897280966767372, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03793824836611748, + "learning_rate": 2.942941431171006e-06, + "loss": -0.0166, + "num_tokens": 39126902.0, + "reward": 3.370452404022217, + "reward_std": 1.010443925857544, + "rewards/accuracy_reward/mean": 2.632171154022217, + "rewards/accuracy_reward/std": 3.516235113143921, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1073.0, + "completions/max_terminated_length": 1073.0, + "completions/mean_length": 573.359375, + "completions/mean_terminated_length": 573.359375, + "completions/min_length": 373.0, + "completions/min_terminated_length": 373.0, + "epoch": 0.13957703927492446, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03587355837225914, + "learning_rate": 2.942162778616625e-06, + "loss": -0.0135, + "num_tokens": 39262541.0, + "reward": 6.469107627868652, + "reward_std": 1.5499069690704346, + "rewards/accuracy_reward/mean": 5.719107627868652, + "rewards/accuracy_reward/std": 3.141127586364746, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1544.0, + "completions/mean_length": 744.109375, + "completions/mean_terminated_length": 723.4127197265625, + "completions/min_length": 357.0, + "completions/min_terminated_length": 357.0, + "epoch": 0.14018126888217522, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.005854463204741478, + "learning_rate": 2.9413789653250414e-06, + "loss": -0.0161, + "num_tokens": 39436468.0, + "reward": 0.9279031157493591, + "reward_std": 0.2295243889093399, + "rewards/accuracy_reward/mean": 0.18180938065052032, + "rewards/accuracy_reward/std": 0.31113898754119873, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1134.0, + "completions/max_terminated_length": 1134.0, + "completions/mean_length": 664.9375, + "completions/mean_terminated_length": 664.9375, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, + "epoch": 0.14078549848942598, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04645448178052902, + "learning_rate": 2.9405899944267087e-06, + "loss": -0.0186, + "num_tokens": 39604480.0, + "reward": 2.969245433807373, + "reward_std": 2.55324649810791, + "rewards/accuracy_reward/mean": 2.230964183807373, + "rewards/accuracy_reward/std": 3.4306440353393555, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 922.0, + "completions/max_terminated_length": 922.0, + "completions/mean_length": 693.8125, + "completions/mean_terminated_length": 693.8125, + "completions/min_length": 535.0, + "completions/min_terminated_length": 535.0, + "epoch": 0.14138972809667674, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004811550490558147, + "learning_rate": 2.939795869072678e-06, + "loss": 0.0011, + "num_tokens": 39751540.0, + "reward": 4.41267204284668, + "reward_std": 0.1356634944677353, + "rewards/accuracy_reward/mean": 3.6665782928466797, + "rewards/accuracy_reward/std": 3.8288798332214355, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 890.0, + "completions/max_terminated_length": 890.0, + "completions/mean_length": 511.203125, + "completions/mean_terminated_length": 511.203125, + "completions/min_length": 309.0, + "completions/min_terminated_length": 309.0, + "epoch": 0.1419939577039275, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.028326084837317467, + "learning_rate": 2.9389965924345864e-06, + "loss": 0.0277, + "num_tokens": 39907009.0, + "reward": 5.360850811004639, + "reward_std": 0.9560673236846924, + "rewards/accuracy_reward/mean": 4.610851287841797, + "rewards/accuracy_reward/std": 3.600973606109619, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1160.0, + "completions/mean_length": 724.71875, + "completions/mean_terminated_length": 703.71435546875, + "completions/min_length": 392.0, + "completions/min_terminated_length": 392.0, + "epoch": 0.14259818731117824, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.033593032509088516, + "learning_rate": 2.938192167704647e-06, + "loss": -0.0081, + "num_tokens": 40052095.0, + "reward": 3.1921844482421875, + "reward_std": 1.8539037704467773, + "rewards/accuracy_reward/mean": 2.4539031982421875, + "rewards/accuracy_reward/std": 3.6278955936431885, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 686.0, + "completions/max_terminated_length": 686.0, + "completions/mean_length": 427.78125, + "completions/mean_terminated_length": 427.78125, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 0.143202416918429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03652987256646156, + "learning_rate": 2.9373825980956302e-06, + "loss": 0.008, + "num_tokens": 40191153.0, + "reward": 7.430817127227783, + "reward_std": 1.5041812658309937, + "rewards/accuracy_reward/mean": 6.684722900390625, + "rewards/accuracy_reward/std": 2.031424045562744, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 948.0, + "completions/max_terminated_length": 948.0, + "completions/mean_length": 604.453125, + "completions/mean_terminated_length": 604.453125, + "completions/min_length": 347.0, + "completions/min_terminated_length": 347.0, + "epoch": 0.14380664652567976, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0451359786093235, + "learning_rate": 2.936567886840857e-06, + "loss": 0.0219, + "num_tokens": 40426478.0, + "reward": 3.143737316131592, + "reward_std": 2.5255260467529297, + "rewards/accuracy_reward/mean": 2.393737316131592, + "rewards/accuracy_reward/std": 3.404465436935425, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 773.0, + "completions/max_terminated_length": 773.0, + "completions/mean_length": 532.6875, + "completions/mean_terminated_length": 532.6875, + "completions/min_length": 322.0, + "completions/min_terminated_length": 322.0, + "epoch": 0.14441087613293052, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.038346484303474426, + "learning_rate": 2.935748037194182e-06, + "loss": 0.0216, + "num_tokens": 40607082.0, + "reward": 6.768865585327148, + "reward_std": 2.072831869125366, + "rewards/accuracy_reward/mean": 6.022771835327148, + "rewards/accuracy_reward/std": 2.916882276535034, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1959.0, + "completions/max_terminated_length": 1959.0, + "completions/mean_length": 741.21875, + "completions/mean_terminated_length": 741.21875, + "completions/min_length": 321.0, + "completions/min_terminated_length": 321.0, + "epoch": 0.14501510574018128, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03391029313206673, + "learning_rate": 2.934923052429984e-06, + "loss": 0.0067, + "num_tokens": 40794744.0, + "reward": 3.589686393737793, + "reward_std": 1.7180606126785278, + "rewards/accuracy_reward/mean": 2.839686632156372, + "rewards/accuracy_reward/std": 3.7373244762420654, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 948.0, + "completions/max_terminated_length": 948.0, + "completions/mean_length": 548.015625, + "completions/mean_terminated_length": 548.015625, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "epoch": 0.14561933534743202, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.040278855711221695, + "learning_rate": 2.9340929358431483e-06, + "loss": 0.0229, + "num_tokens": 40921721.0, + "reward": 5.231960773468018, + "reward_std": 1.64393150806427, + "rewards/accuracy_reward/mean": 4.481960773468018, + "rewards/accuracy_reward/std": 3.6042582988739014, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1556.0, + "completions/max_terminated_length": 1556.0, + "completions/mean_length": 798.640625, + "completions/mean_terminated_length": 798.640625, + "completions/min_length": 417.0, + "completions/min_terminated_length": 417.0, + "epoch": 0.14622356495468278, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.023954786360263824, + "learning_rate": 2.933257690749057e-06, + "loss": 0.0263, + "num_tokens": 41073266.0, + "reward": 2.3677499294281006, + "reward_std": 0.882728099822998, + "rewards/accuracy_reward/mean": 1.6177499294281006, + "rewards/accuracy_reward/std": 3.128265857696533, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 861.0, + "completions/mean_length": 584.234375, + "completions/mean_terminated_length": 561.0000610351562, + "completions/min_length": 295.0, + "completions/min_terminated_length": 295.0, + "epoch": 0.14682779456193354, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04631247743964195, + "learning_rate": 2.9324173204835756e-06, + "loss": -0.0066, + "num_tokens": 41260353.0, + "reward": 2.023648738861084, + "reward_std": 1.8510980606079102, + "rewards/accuracy_reward/mean": 1.285367488861084, + "rewards/accuracy_reward/std": 2.820128917694092, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 960.0, + "completions/max_terminated_length": 960.0, + "completions/mean_length": 571.65625, + "completions/mean_terminated_length": 571.65625, + "completions/min_length": 345.0, + "completions/min_terminated_length": 345.0, + "epoch": 0.1474320241691843, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04033549502491951, + "learning_rate": 2.9315718284030377e-06, + "loss": 0.0135, + "num_tokens": 41403083.0, + "reward": 5.593713760375977, + "reward_std": 2.402498960494995, + "rewards/accuracy_reward/mean": 4.843713760375977, + "rewards/accuracy_reward/std": 3.569208860397339, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1269.0, + "completions/max_terminated_length": 1269.0, + "completions/mean_length": 616.265625, + "completions/mean_terminated_length": 616.265625, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, + "epoch": 0.14803625377643503, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04232070595026016, + "learning_rate": 2.930721217884234e-06, + "loss": 0.0055, + "num_tokens": 41635964.0, + "reward": 4.557844161987305, + "reward_std": 2.018305778503418, + "rewards/accuracy_reward/mean": 3.8078441619873047, + "rewards/accuracy_reward/std": 3.6694300174713135, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 722.0, + "completions/mean_length": 464.359375, + "completions/mean_terminated_length": 439.2222595214844, + "completions/min_length": 278.0, + "completions/min_terminated_length": 278.0, + "epoch": 0.1486404833836858, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.02530074305832386, + "learning_rate": 2.929865492324397e-06, + "loss": -0.0022, + "num_tokens": 41795827.0, + "reward": 3.873990535736084, + "reward_std": 0.92746901512146, + "rewards/accuracy_reward/mean": 3.135709285736084, + "rewards/accuracy_reward/std": 3.6526050567626953, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 885.0, + "completions/max_terminated_length": 885.0, + "completions/mean_length": 520.359375, + "completions/mean_terminated_length": 520.359375, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, + "epoch": 0.14924471299093656, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04277098923921585, + "learning_rate": 2.9290046551411876e-06, + "loss": 0.0023, + "num_tokens": 41955978.0, + "reward": 6.114355087280273, + "reward_std": 2.580766439437866, + "rewards/accuracy_reward/mean": 5.364355087280273, + "rewards/accuracy_reward/std": 3.290117025375366, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 906.0, + "completions/max_terminated_length": 906.0, + "completions/mean_length": 567.671875, + "completions/mean_terminated_length": 567.671875, + "completions/min_length": 324.0, + "completions/min_terminated_length": 324.0, + "epoch": 0.14984894259818732, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05857588350772858, + "learning_rate": 2.9281387097726818e-06, + "loss": 0.0317, + "num_tokens": 42109333.0, + "reward": 3.2371418476104736, + "reward_std": 2.717564105987549, + "rewards/accuracy_reward/mean": 2.4871418476104736, + "rewards/accuracy_reward/std": 3.4511525630950928, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1587.0, + "completions/max_terminated_length": 1587.0, + "completions/mean_length": 647.65625, + "completions/mean_terminated_length": 647.65625, + "completions/min_length": 324.0, + "completions/min_terminated_length": 324.0, + "epoch": 0.15045317220543808, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.01743694581091404, + "learning_rate": 2.9272676596773587e-06, + "loss": -0.0019, + "num_tokens": 42260831.0, + "reward": 2.8502466678619385, + "reward_std": 0.7382071018218994, + "rewards/accuracy_reward/mean": 2.1002466678619385, + "rewards/accuracy_reward/std": 3.359677791595459, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 696.0, + "completions/max_terminated_length": 696.0, + "completions/mean_length": 469.546875, + "completions/mean_terminated_length": 469.546875, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "epoch": 0.1510574018126888, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.037658993154764175, + "learning_rate": 2.926391508334083e-06, + "loss": -0.0081, + "num_tokens": 42382082.0, + "reward": 4.581892013549805, + "reward_std": 1.8936948776245117, + "rewards/accuracy_reward/mean": 3.8318920135498047, + "rewards/accuracy_reward/std": 3.7184054851531982, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1106.0, + "completions/max_terminated_length": 1106.0, + "completions/mean_length": 633.75, + "completions/mean_terminated_length": 633.75, + "completions/min_length": 333.0, + "completions/min_terminated_length": 333.0, + "epoch": 0.15166163141993957, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03865529224276543, + "learning_rate": 2.9255102592420945e-06, + "loss": -0.0101, + "num_tokens": 42513618.0, + "reward": 3.0420241355895996, + "reward_std": 1.714064121246338, + "rewards/accuracy_reward/mean": 2.2959303855895996, + "rewards/accuracy_reward/std": 3.360133409500122, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 251 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1364.0, + "completions/mean_length": 608.484375, + "completions/mean_terminated_length": 562.04833984375, + "completions/min_length": 356.0, + "completions/min_terminated_length": 356.0, + "epoch": 0.15226586102719034, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.01421197596937418, + "learning_rate": 2.924623915920992e-06, + "loss": -0.0147, + "num_tokens": 42646561.0, + "reward": 2.667158603668213, + "reward_std": 0.6170955300331116, + "rewards/accuracy_reward/mean": 1.9484083652496338, + "rewards/accuracy_reward/std": 3.3412230014801025, + "rewards/tag_count_reward/mean": 0.71875, + "rewards/tag_count_reward/std": 0.14433756470680237, + "step": 252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1840.0, + "completions/max_terminated_length": 1840.0, + "completions/mean_length": 775.96875, + "completions/mean_terminated_length": 775.96875, + "completions/min_length": 363.0, + "completions/min_terminated_length": 363.0, + "epoch": 0.1528700906344411, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.027866128832101822, + "learning_rate": 2.9237324819107205e-06, + "loss": -0.0097, + "num_tokens": 42890031.0, + "reward": 2.163419723510742, + "reward_std": 1.114722728729248, + "rewards/accuracy_reward/mean": 1.4212324619293213, + "rewards/accuracy_reward/std": 2.7955634593963623, + "rewards/tag_count_reward/mean": 0.7421875, + "rewards/tag_count_reward/std": 0.043842025101184845, + "step": 253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 922.0, + "completions/max_terminated_length": 922.0, + "completions/mean_length": 574.5625, + "completions/mean_terminated_length": 574.5625, + "completions/min_length": 292.0, + "completions/min_terminated_length": 292.0, + "epoch": 0.15347432024169183, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05179426819086075, + "learning_rate": 2.9228359607715566e-06, + "loss": 0.0081, + "num_tokens": 43141299.0, + "reward": 5.543596267700195, + "reward_std": 2.287752389907837, + "rewards/accuracy_reward/mean": 4.797502517700195, + "rewards/accuracy_reward/std": 3.51196551322937, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 756.0, + "completions/max_terminated_length": 756.0, + "completions/mean_length": 498.1875, + "completions/mean_terminated_length": 498.1875, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "epoch": 0.1540785498489426, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03748702257871628, + "learning_rate": 2.921934356084094e-06, + "loss": 0.0198, + "num_tokens": 43379871.0, + "reward": 7.26976203918457, + "reward_std": 1.6041080951690674, + "rewards/accuracy_reward/mean": 6.51976203918457, + "rewards/accuracy_reward/std": 2.414691925048828, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 872.0, + "completions/max_terminated_length": 872.0, + "completions/mean_length": 591.75, + "completions/mean_terminated_length": 591.75, + "completions/min_length": 426.0, + "completions/min_terminated_length": 426.0, + "epoch": 0.15468277945619335, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04660031944513321, + "learning_rate": 2.921027671449229e-06, + "loss": -0.0173, + "num_tokens": 43580127.0, + "reward": 4.781923294067383, + "reward_std": 1.786790370941162, + "rewards/accuracy_reward/mean": 4.035829544067383, + "rewards/accuracy_reward/std": 3.756173610687256, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 794.0, + "completions/max_terminated_length": 794.0, + "completions/mean_length": 560.453125, + "completions/mean_terminated_length": 560.453125, + "completions/min_length": 339.0, + "completions/min_terminated_length": 339.0, + "epoch": 0.15528700906344411, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03898140788078308, + "learning_rate": 2.9201159104881477e-06, + "loss": -0.0109, + "num_tokens": 43787100.0, + "reward": 3.9833967685699463, + "reward_std": 1.6535322666168213, + "rewards/accuracy_reward/mean": 3.233396530151367, + "rewards/accuracy_reward/std": 3.76283597946167, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1006.0, + "completions/max_terminated_length": 1006.0, + "completions/mean_length": 582.1875, + "completions/mean_terminated_length": 582.1875, + "completions/min_length": 343.0, + "completions/min_terminated_length": 343.0, + "epoch": 0.15589123867069488, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03658176586031914, + "learning_rate": 2.91919907684231e-06, + "loss": 0.0243, + "num_tokens": 43950728.0, + "reward": 5.175109386444092, + "reward_std": 1.720700979232788, + "rewards/accuracy_reward/mean": 4.42510986328125, + "rewards/accuracy_reward/std": 3.7608163356781006, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1052.0, + "completions/max_terminated_length": 1052.0, + "completions/mean_length": 589.375, + "completions/mean_terminated_length": 589.375, + "completions/min_length": 398.0, + "completions/min_terminated_length": 398.0, + "epoch": 0.1564954682779456, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03538144379854202, + "learning_rate": 2.9182771741734347e-06, + "loss": 0.0173, + "num_tokens": 44120560.0, + "reward": 5.523087024688721, + "reward_std": 1.519083023071289, + "rewards/accuracy_reward/mean": 4.773087024688721, + "rewards/accuracy_reward/std": 3.8663580417633057, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 259 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1218.0, + "completions/mean_length": 692.609375, + "completions/mean_terminated_length": 648.8870849609375, + "completions/min_length": 430.0, + "completions/min_terminated_length": 430.0, + "epoch": 0.15709969788519637, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.029979819431900978, + "learning_rate": 2.9173502061634865e-06, + "loss": -0.023, + "num_tokens": 44289143.0, + "reward": 4.946178436279297, + "reward_std": 1.0773463249206543, + "rewards/accuracy_reward/mean": 4.227427959442139, + "rewards/accuracy_reward/std": 3.807823419570923, + "rewards/tag_count_reward/mean": 0.71875, + "rewards/tag_count_reward/std": 0.13729241490364075, + "step": 260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1459.0, + "completions/mean_length": 772.3125, + "completions/mean_terminated_length": 731.1612548828125, + "completions/min_length": 431.0, + "completions/min_terminated_length": 431.0, + "epoch": 0.15770392749244713, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03403715044260025, + "learning_rate": 2.91641817651466e-06, + "loss": -0.0262, + "num_tokens": 44457787.0, + "reward": 2.2484796047210693, + "reward_std": 1.4450067281723022, + "rewards/accuracy_reward/mean": 1.5219171047210693, + "rewards/accuracy_reward/std": 3.1914098262786865, + "rewards/tag_count_reward/mean": 0.7265625, + "rewards/tag_count_reward/std": 0.13152606785297394, + "step": 261 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1601.0, + "completions/mean_length": 623.828125, + "completions/mean_terminated_length": 577.8870849609375, + "completions/min_length": 286.0, + "completions/min_terminated_length": 286.0, + "epoch": 0.1583081570996979, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.050943680107593536, + "learning_rate": 2.915481088949366e-06, + "loss": 0.002, + "num_tokens": 44586832.0, + "reward": 4.690764427185059, + "reward_std": 2.5196590423583984, + "rewards/accuracy_reward/mean": 3.9524831771850586, + "rewards/accuracy_reward/std": 3.7608768939971924, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 262 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1396.0, + "completions/max_terminated_length": 1396.0, + "completions/mean_length": 649.28125, + "completions/mean_terminated_length": 649.28125, + "completions/min_length": 358.0, + "completions/min_terminated_length": 358.0, + "epoch": 0.15891238670694863, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.05296681448817253, + "learning_rate": 2.9145389472102147e-06, + "loss": 0.0701, + "num_tokens": 44759570.0, + "reward": 3.787025213241577, + "reward_std": 2.515841484069824, + "rewards/accuracy_reward/mean": 3.037024974822998, + "rewards/accuracy_reward/std": 3.7019617557525635, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1647.0, + "completions/max_terminated_length": 1647.0, + "completions/mean_length": 721.34375, + "completions/mean_terminated_length": 721.34375, + "completions/min_length": 385.0, + "completions/min_terminated_length": 385.0, + "epoch": 0.1595166163141994, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06589177995920181, + "learning_rate": 2.913591755060004e-06, + "loss": -0.0496, + "num_tokens": 44941368.0, + "reward": 3.303603172302246, + "reward_std": 1.4725635051727295, + "rewards/accuracy_reward/mean": 2.553603172302246, + "rewards/accuracy_reward/std": 3.5930957794189453, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1808.0, + "completions/mean_length": 660.375, + "completions/mean_terminated_length": 615.6129150390625, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "epoch": 0.16012084592145015, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.012457478791475296, + "learning_rate": 2.9126395162817003e-06, + "loss": -0.0215, + "num_tokens": 45095968.0, + "reward": 6.152181625366211, + "reward_std": 0.6912950277328491, + "rewards/accuracy_reward/mean": 5.425619125366211, + "rewards/accuracy_reward/std": 3.4017176628112793, + "rewards/tag_count_reward/mean": 0.7265625, + "rewards/tag_count_reward/std": 0.13152606785297394, + "step": 265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 832.0, + "completions/max_terminated_length": 832.0, + "completions/mean_length": 574.625, + "completions/mean_terminated_length": 574.625, + "completions/min_length": 403.0, + "completions/min_terminated_length": 403.0, + "epoch": 0.1607250755287009, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05205154791474342, + "learning_rate": 2.9116822346784274e-06, + "loss": -0.0043, + "num_tokens": 45263112.0, + "reward": 3.8980579376220703, + "reward_std": 1.9346015453338623, + "rewards/accuracy_reward/mean": 3.1480579376220703, + "rewards/accuracy_reward/std": 3.8317956924438477, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 266 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 781.0, + "completions/max_terminated_length": 781.0, + "completions/mean_length": 554.921875, + "completions/mean_terminated_length": 554.921875, + "completions/min_length": 380.0, + "completions/min_terminated_length": 380.0, + "epoch": 0.16132930513595167, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04613035172224045, + "learning_rate": 2.9107199140734483e-06, + "loss": 0.0462, + "num_tokens": 45402259.0, + "reward": 6.214789867401123, + "reward_std": 2.62968111038208, + "rewards/accuracy_reward/mean": 5.464790344238281, + "rewards/accuracy_reward/std": 3.2511661052703857, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 267 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 875.0, + "completions/max_terminated_length": 875.0, + "completions/mean_length": 531.21875, + "completions/mean_terminated_length": 531.21875, + "completions/min_length": 361.0, + "completions/min_terminated_length": 361.0, + "epoch": 0.1619335347432024, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03686540201306343, + "learning_rate": 2.9097525583101523e-06, + "loss": 0.0123, + "num_tokens": 45586241.0, + "reward": 5.030254364013672, + "reward_std": 2.2618720531463623, + "rewards/accuracy_reward/mean": 4.280254364013672, + "rewards/accuracy_reward/std": 3.692824363708496, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 903.0, + "completions/max_terminated_length": 903.0, + "completions/mean_length": 571.84375, + "completions/mean_terminated_length": 571.84375, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "epoch": 0.16253776435045317, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03730614483356476, + "learning_rate": 2.9087801712520374e-06, + "loss": 0.0249, + "num_tokens": 45752007.0, + "reward": 5.065875053405762, + "reward_std": 1.8830809593200684, + "rewards/accuracy_reward/mean": 4.31587553024292, + "rewards/accuracy_reward/std": 3.6545732021331787, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1587.0, + "completions/max_terminated_length": 1587.0, + "completions/mean_length": 771.140625, + "completions/mean_terminated_length": 771.140625, + "completions/min_length": 484.0, + "completions/min_terminated_length": 484.0, + "epoch": 0.16314199395770393, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0492420494556427, + "learning_rate": 2.907802756782696e-06, + "loss": 0.0643, + "num_tokens": 45966208.0, + "reward": 2.3614578247070312, + "reward_std": 1.927039623260498, + "rewards/accuracy_reward/mean": 1.6114578247070312, + "rewards/accuracy_reward/std": 2.980532646179199, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1265.0, + "completions/max_terminated_length": 1265.0, + "completions/mean_length": 584.875, + "completions/mean_terminated_length": 584.875, + "completions/min_length": 313.0, + "completions/min_terminated_length": 313.0, + "epoch": 0.1637462235649547, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04633098468184471, + "learning_rate": 2.9068203188058003e-06, + "loss": 0.0211, + "num_tokens": 46112216.0, + "reward": 6.789831638336182, + "reward_std": 2.2427005767822266, + "rewards/accuracy_reward/mean": 6.039831638336182, + "rewards/accuracy_reward/std": 2.9247095584869385, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 963.0, + "completions/max_terminated_length": 963.0, + "completions/mean_length": 599.515625, + "completions/mean_terminated_length": 599.515625, + "completions/min_length": 359.0, + "completions/min_terminated_length": 359.0, + "epoch": 0.16435045317220545, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04916281998157501, + "learning_rate": 2.905832861245085e-06, + "loss": -0.0007, + "num_tokens": 46282761.0, + "reward": 2.59173583984375, + "reward_std": 2.272418737411499, + "rewards/accuracy_reward/mean": 1.8417359590530396, + "rewards/accuracy_reward/std": 3.2765374183654785, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1139.0, + "completions/max_terminated_length": 1139.0, + "completions/mean_length": 611.625, + "completions/mean_terminated_length": 611.625, + "completions/min_length": 344.0, + "completions/min_terminated_length": 344.0, + "epoch": 0.16495468277945619, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04688900336623192, + "learning_rate": 2.904840388044333e-06, + "loss": -0.0188, + "num_tokens": 46442065.0, + "reward": 5.7190775871276855, + "reward_std": 1.7745592594146729, + "rewards/accuracy_reward/mean": 4.969077110290527, + "rewards/accuracy_reward/std": 3.4428093433380127, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1023.0, + "completions/max_terminated_length": 1023.0, + "completions/mean_length": 591.75, + "completions/mean_terminated_length": 591.75, + "completions/min_length": 383.0, + "completions/min_terminated_length": 383.0, + "epoch": 0.16555891238670695, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.046056561172008514, + "learning_rate": 2.903842903167358e-06, + "loss": 0.0271, + "num_tokens": 46586337.0, + "reward": 3.185485601425171, + "reward_std": 2.4305131435394287, + "rewards/accuracy_reward/mean": 2.43548583984375, + "rewards/accuracy_reward/std": 3.5329079627990723, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 884.0, + "completions/max_terminated_length": 884.0, + "completions/mean_length": 565.4375, + "completions/mean_terminated_length": 565.4375, + "completions/min_length": 383.0, + "completions/min_terminated_length": 383.0, + "epoch": 0.1661631419939577, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.050215814262628555, + "learning_rate": 2.902840410597991e-06, + "loss": 0.0261, + "num_tokens": 46742813.0, + "reward": 6.669349670410156, + "reward_std": 2.392852783203125, + "rewards/accuracy_reward/mean": 5.919349670410156, + "rewards/accuracy_reward/std": 3.053957223892212, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 911.0, + "completions/max_terminated_length": 911.0, + "completions/mean_length": 520.515625, + "completions/mean_terminated_length": 520.515625, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "epoch": 0.16676737160120847, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03866554796695709, + "learning_rate": 2.901832914340062e-06, + "loss": -0.0076, + "num_tokens": 46910990.0, + "reward": 6.9937920570373535, + "reward_std": 2.166715621948242, + "rewards/accuracy_reward/mean": 6.243791580200195, + "rewards/accuracy_reward/std": 2.671415328979492, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1105.0, + "completions/max_terminated_length": 1105.0, + "completions/mean_length": 583.609375, + "completions/mean_terminated_length": 583.609375, + "completions/min_length": 305.0, + "completions/min_terminated_length": 305.0, + "epoch": 0.1673716012084592, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.027601197361946106, + "learning_rate": 2.900820418417386e-06, + "loss": -0.0048, + "num_tokens": 47063237.0, + "reward": 3.2799839973449707, + "reward_std": 0.9874410033226013, + "rewards/accuracy_reward/mean": 2.52998423576355, + "rewards/accuracy_reward/std": 3.4286468029022217, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 277 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1909.0, + "completions/mean_length": 626.609375, + "completions/mean_terminated_length": 604.0476684570312, + "completions/min_length": 344.0, + "completions/min_terminated_length": 344.0, + "epoch": 0.16797583081570996, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.023600086569786072, + "learning_rate": 2.899802926873745e-06, + "loss": 0.001, + "num_tokens": 47160956.0, + "reward": 4.320138931274414, + "reward_std": 0.6136535406112671, + "rewards/accuracy_reward/mean": 3.574045181274414, + "rewards/accuracy_reward/std": 3.7837226390838623, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1611.0, + "completions/mean_length": 676.890625, + "completions/mean_terminated_length": 632.6612548828125, + "completions/min_length": 414.0, + "completions/min_terminated_length": 414.0, + "epoch": 0.16858006042296073, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03470296785235405, + "learning_rate": 2.8987804437728744e-06, + "loss": -0.0988, + "num_tokens": 47305509.0, + "reward": 6.84832763671875, + "reward_std": 1.8615984916687012, + "rewards/accuracy_reward/mean": 6.12176513671875, + "rewards/accuracy_reward/std": 2.924469232559204, + "rewards/tag_count_reward/mean": 0.7265625, + "rewards/tag_count_reward/std": 0.13152606785297394, + "step": 279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 923.0, + "completions/max_terminated_length": 923.0, + "completions/mean_length": 538.859375, + "completions/mean_terminated_length": 538.859375, + "completions/min_length": 386.0, + "completions/min_terminated_length": 386.0, + "epoch": 0.1691842900302115, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0400761254131794, + "learning_rate": 2.8977529731984437e-06, + "loss": 0.0059, + "num_tokens": 47466252.0, + "reward": 3.7162904739379883, + "reward_std": 1.703370451927185, + "rewards/accuracy_reward/mean": 2.9662907123565674, + "rewards/accuracy_reward/std": 3.716614246368408, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1128.0, + "completions/max_terminated_length": 1128.0, + "completions/mean_length": 627.6875, + "completions/mean_terminated_length": 627.6875, + "completions/min_length": 285.0, + "completions/min_terminated_length": 285.0, + "epoch": 0.16978851963746225, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.029375839978456497, + "learning_rate": 2.896720519254042e-06, + "loss": -0.001, + "num_tokens": 47644456.0, + "reward": 2.7611618041992188, + "reward_std": 1.3776174783706665, + "rewards/accuracy_reward/mean": 2.0150680541992188, + "rewards/accuracy_reward/std": 3.3441014289855957, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 880.0, + "completions/max_terminated_length": 880.0, + "completions/mean_length": 557.78125, + "completions/mean_terminated_length": 557.78125, + "completions/min_length": 319.0, + "completions/min_terminated_length": 319.0, + "epoch": 0.17039274924471298, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04742120951414108, + "learning_rate": 2.895683086063163e-06, + "loss": -0.0004, + "num_tokens": 47794234.0, + "reward": 5.392189025878906, + "reward_std": 2.809903144836426, + "rewards/accuracy_reward/mean": 4.642189025878906, + "rewards/accuracy_reward/std": 3.630307674407959, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1253.0, + "completions/mean_length": 713.015625, + "completions/mean_terminated_length": 691.825439453125, + "completions/min_length": 473.0, + "completions/min_terminated_length": 473.0, + "epoch": 0.17099697885196374, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04102597013115883, + "learning_rate": 2.8946406777691845e-06, + "loss": 0.005, + "num_tokens": 47982379.0, + "reward": 4.418495178222656, + "reward_std": 2.1849939823150635, + "rewards/accuracy_reward/mean": 3.6802139282226562, + "rewards/accuracy_reward/std": 3.8095216751098633, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 846.0, + "completions/max_terminated_length": 846.0, + "completions/mean_length": 547.09375, + "completions/mean_terminated_length": 547.09375, + "completions/min_length": 288.0, + "completions/min_terminated_length": 288.0, + "epoch": 0.1716012084592145, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.036062922328710556, + "learning_rate": 2.893593298535356e-06, + "loss": 0.0071, + "num_tokens": 48134209.0, + "reward": 5.240157127380371, + "reward_std": 1.5950809717178345, + "rewards/accuracy_reward/mean": 4.490157127380371, + "rewards/accuracy_reward/std": 3.658067464828491, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 756.0, + "completions/max_terminated_length": 756.0, + "completions/mean_length": 579.78125, + "completions/mean_terminated_length": 579.78125, + "completions/min_length": 409.0, + "completions/min_terminated_length": 409.0, + "epoch": 0.17220543806646527, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.020503951236605644, + "learning_rate": 2.8925409525447796e-06, + "loss": 0.0084, + "num_tokens": 48389619.0, + "reward": 6.064119815826416, + "reward_std": 0.6930873394012451, + "rewards/accuracy_reward/mean": 5.314120292663574, + "rewards/accuracy_reward/std": 3.3006744384765625, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 738.0, + "completions/max_terminated_length": 738.0, + "completions/mean_length": 483.671875, + "completions/mean_terminated_length": 483.671875, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "epoch": 0.172809667673716, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03564830124378204, + "learning_rate": 2.891483644000394e-06, + "loss": -0.0017, + "num_tokens": 48524238.0, + "reward": 4.796241283416748, + "reward_std": 1.4561526775360107, + "rewards/accuracy_reward/mean": 4.046241283416748, + "rewards/accuracy_reward/std": 3.931739330291748, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 286 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 836.0, + "completions/max_terminated_length": 836.0, + "completions/mean_length": 555.515625, + "completions/mean_terminated_length": 555.515625, + "completions/min_length": 337.0, + "completions/min_terminated_length": 337.0, + "epoch": 0.17341389728096676, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04118945822119713, + "learning_rate": 2.890421377124958e-06, + "loss": 0.0077, + "num_tokens": 48680527.0, + "reward": 5.350277423858643, + "reward_std": 2.5462918281555176, + "rewards/accuracy_reward/mean": 4.600277900695801, + "rewards/accuracy_reward/std": 3.6264665126800537, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1248.0, + "completions/max_terminated_length": 1248.0, + "completions/mean_length": 582.46875, + "completions/mean_terminated_length": 582.46875, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "epoch": 0.17401812688821752, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.020825058221817017, + "learning_rate": 2.889354156161033e-06, + "loss": -0.0096, + "num_tokens": 48793933.0, + "reward": 0.9830771088600159, + "reward_std": 0.7403117418289185, + "rewards/accuracy_reward/mean": 0.23307710886001587, + "rewards/accuracy_reward/std": 1.310404658317566, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 928.0, + "completions/max_terminated_length": 928.0, + "completions/mean_length": 551.625, + "completions/mean_terminated_length": 551.625, + "completions/min_length": 316.0, + "completions/min_terminated_length": 316.0, + "epoch": 0.17462235649546828, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03567584976553917, + "learning_rate": 2.8882819853709667e-06, + "loss": -0.0079, + "num_tokens": 48968853.0, + "reward": 3.880746603012085, + "reward_std": 1.794625997543335, + "rewards/accuracy_reward/mean": 3.130746841430664, + "rewards/accuracy_reward/std": 3.6939940452575684, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 289 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1093.0, + "completions/mean_length": 755.28125, + "completions/mean_terminated_length": 734.761962890625, + "completions/min_length": 426.0, + "completions/min_terminated_length": 426.0, + "epoch": 0.17522658610271905, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06109931692481041, + "learning_rate": 2.8872048690368763e-06, + "loss": -0.0232, + "num_tokens": 49212327.0, + "reward": 5.05893087387085, + "reward_std": 3.4629766941070557, + "rewards/accuracy_reward/mean": 4.320650100708008, + "rewards/accuracy_reward/std": 3.6057755947113037, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 832.0, + "completions/max_terminated_length": 832.0, + "completions/mean_length": 603.28125, + "completions/mean_terminated_length": 603.28125, + "completions/min_length": 437.0, + "completions/min_terminated_length": 437.0, + "epoch": 0.17583081570996978, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0335308201611042, + "learning_rate": 2.8861228114606293e-06, + "loss": 0.0002, + "num_tokens": 49399049.0, + "reward": 3.765578269958496, + "reward_std": 1.0002281665802002, + "rewards/accuracy_reward/mean": 3.015578269958496, + "rewards/accuracy_reward/std": 3.5793652534484863, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 291 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1004.0, + "completions/mean_length": 553.984375, + "completions/mean_terminated_length": 530.2698974609375, + "completions/min_length": 287.0, + "completions/min_terminated_length": 287.0, + "epoch": 0.17643504531722054, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0406607985496521, + "learning_rate": 2.885035816963829e-06, + "loss": -0.0295, + "num_tokens": 49526904.0, + "reward": 5.500405311584473, + "reward_std": 2.4453725814819336, + "rewards/accuracy_reward/mean": 4.762124061584473, + "rewards/accuracy_reward/std": 3.5811634063720703, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1354.0, + "completions/mean_length": 695.28125, + "completions/mean_terminated_length": 673.8095703125, + "completions/min_length": 341.0, + "completions/min_terminated_length": 341.0, + "epoch": 0.1770392749244713, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0576370432972908, + "learning_rate": 2.8839438898877967e-06, + "loss": -0.0749, + "num_tokens": 49718410.0, + "reward": 5.069185733795166, + "reward_std": 2.7246639728546143, + "rewards/accuracy_reward/mean": 4.330904483795166, + "rewards/accuracy_reward/std": 3.6601736545562744, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 973.0, + "completions/max_terminated_length": 973.0, + "completions/mean_length": 570.03125, + "completions/mean_terminated_length": 570.03125, + "completions/min_length": 404.0, + "completions/min_terminated_length": 404.0, + "epoch": 0.17764350453172206, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.019952965900301933, + "learning_rate": 2.8828470345935527e-06, + "loss": 0.0, + "num_tokens": 49901388.0, + "reward": 2.9782986640930176, + "reward_std": 0.9747496247291565, + "rewards/accuracy_reward/mean": 2.2282981872558594, + "rewards/accuracy_reward/std": 3.305105447769165, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 896.0, + "completions/max_terminated_length": 896.0, + "completions/mean_length": 586.953125, + "completions/mean_terminated_length": 586.953125, + "completions/min_length": 388.0, + "completions/min_terminated_length": 388.0, + "epoch": 0.1782477341389728, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.033348675817251205, + "learning_rate": 2.8817452554618005e-06, + "loss": 0.0009, + "num_tokens": 50062057.0, + "reward": 2.930136203765869, + "reward_std": 1.5511075258255005, + "rewards/accuracy_reward/mean": 2.180136203765869, + "rewards/accuracy_reward/std": 3.4550609588623047, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1001.0, + "completions/max_terminated_length": 1001.0, + "completions/mean_length": 642.046875, + "completions/mean_terminated_length": 642.046875, + "completions/min_length": 365.0, + "completions/min_terminated_length": 365.0, + "epoch": 0.17885196374622356, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04772470146417618, + "learning_rate": 2.8806385568929088e-06, + "loss": -0.0092, + "num_tokens": 50295580.0, + "reward": 3.9096899032592773, + "reward_std": 2.566070795059204, + "rewards/accuracy_reward/mean": 3.1596899032592773, + "rewards/accuracy_reward/std": 3.6940958499908447, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1364.0, + "completions/mean_length": 612.140625, + "completions/mean_terminated_length": 589.3492431640625, + "completions/min_length": 401.0, + "completions/min_terminated_length": 401.0, + "epoch": 0.17945619335347432, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03336336463689804, + "learning_rate": 2.8795269433068937e-06, + "loss": 0.0195, + "num_tokens": 50471797.0, + "reward": 5.619311332702637, + "reward_std": 1.5804426670074463, + "rewards/accuracy_reward/mean": 4.8810296058654785, + "rewards/accuracy_reward/std": 3.6163463592529297, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 732.0, + "completions/max_terminated_length": 732.0, + "completions/mean_length": 524.921875, + "completions/mean_terminated_length": 524.921875, + "completions/min_length": 365.0, + "completions/min_terminated_length": 365.0, + "epoch": 0.18006042296072508, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.024770982563495636, + "learning_rate": 2.878410419143402e-06, + "loss": -0.0013, + "num_tokens": 50619648.0, + "reward": 7.675644874572754, + "reward_std": 0.9768667221069336, + "rewards/accuracy_reward/mean": 6.9256439208984375, + "rewards/accuracy_reward/std": 2.0629353523254395, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 766.0, + "completions/max_terminated_length": 766.0, + "completions/mean_length": 466.0625, + "completions/mean_terminated_length": 466.0625, + "completions/min_length": 288.0, + "completions/min_terminated_length": 288.0, + "epoch": 0.18066465256797584, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0339764729142189, + "learning_rate": 2.877288988861691e-06, + "loss": 0.0116, + "num_tokens": 50782228.0, + "reward": 7.054330348968506, + "reward_std": 1.4812289476394653, + "rewards/accuracy_reward/mean": 6.304330348968506, + "rewards/accuracy_reward/std": 2.5756661891937256, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 706.0, + "completions/max_terminated_length": 706.0, + "completions/mean_length": 511.0625, + "completions/mean_terminated_length": 511.0625, + "completions/min_length": 376.0, + "completions/min_terminated_length": 376.0, + "epoch": 0.18126888217522658, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0383390448987484, + "learning_rate": 2.876162656940614e-06, + "loss": 0.0088, + "num_tokens": 50941336.0, + "reward": 6.668456554412842, + "reward_std": 1.549542784690857, + "rewards/accuracy_reward/mean": 5.918456554412842, + "rewards/accuracy_reward/std": 3.0534772872924805, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 703.0, + "completions/mean_length": 544.890625, + "completions/mean_terminated_length": 521.0317993164062, + "completions/min_length": 336.0, + "completions/min_terminated_length": 336.0, + "epoch": 0.18187311178247734, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03392835706472397, + "learning_rate": 2.8750314278786016e-06, + "loss": -0.0079, + "num_tokens": 51125361.0, + "reward": 3.2643473148345947, + "reward_std": 1.5975382328033447, + "rewards/accuracy_reward/mean": 2.526066303253174, + "rewards/accuracy_reward/std": 3.588475227355957, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 301 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 756.0, + "completions/max_terminated_length": 756.0, + "completions/mean_length": 483.796875, + "completions/mean_terminated_length": 483.796875, + "completions/min_length": 291.0, + "completions/min_terminated_length": 291.0, + "epoch": 0.1824773413897281, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05143491178750992, + "learning_rate": 2.8738953061936405e-06, + "loss": 0.0074, + "num_tokens": 51261028.0, + "reward": 5.698363304138184, + "reward_std": 3.0150442123413086, + "rewards/accuracy_reward/mean": 4.948363304138184, + "rewards/accuracy_reward/std": 3.4851834774017334, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 651.0, + "completions/max_terminated_length": 651.0, + "completions/mean_length": 475.8125, + "completions/mean_terminated_length": 475.8125, + "completions/min_length": 311.0, + "completions/min_terminated_length": 311.0, + "epoch": 0.18308157099697886, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.023621659725904465, + "learning_rate": 2.8727542964232595e-06, + "loss": 0.0076, + "num_tokens": 51389992.0, + "reward": 5.733273983001709, + "reward_std": 0.9065025448799133, + "rewards/accuracy_reward/mean": 4.983273983001709, + "rewards/accuracy_reward/std": 3.510225772857666, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 884.0, + "completions/max_terminated_length": 884.0, + "completions/mean_length": 546.328125, + "completions/mean_terminated_length": 546.328125, + "completions/min_length": 371.0, + "completions/min_terminated_length": 371.0, + "epoch": 0.18368580060422962, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.025650301948189735, + "learning_rate": 2.8716084031245094e-06, + "loss": 0.0131, + "num_tokens": 51560541.0, + "reward": 2.9247899055480957, + "reward_std": 0.7727554440498352, + "rewards/accuracy_reward/mean": 2.1747896671295166, + "rewards/accuracy_reward/std": 3.4324426651000977, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 918.0, + "completions/max_terminated_length": 918.0, + "completions/mean_length": 498.3125, + "completions/mean_terminated_length": 498.3125, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 0.18429003021148035, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.029566025361418724, + "learning_rate": 2.8704576308739454e-06, + "loss": 0.0037, + "num_tokens": 51694529.0, + "reward": 7.717215538024902, + "reward_std": 1.1610198020935059, + "rewards/accuracy_reward/mean": 6.967215538024902, + "rewards/accuracy_reward/std": 1.5643384456634521, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 960.0, + "completions/mean_length": 621.359375, + "completions/mean_terminated_length": 575.3386840820312, + "completions/min_length": 390.0, + "completions/min_terminated_length": 390.0, + "epoch": 0.18489425981873112, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.032997604459524155, + "learning_rate": 2.869301984267609e-06, + "loss": -0.0152, + "num_tokens": 51822024.0, + "reward": 3.0059046745300293, + "reward_std": 1.504867434501648, + "rewards/accuracy_reward/mean": 2.2793421745300293, + "rewards/accuracy_reward/std": 3.515826463699341, + "rewards/tag_count_reward/mean": 0.7265625, + "rewards/tag_count_reward/std": 0.13152606785297394, + "step": 306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1146.0, + "completions/max_terminated_length": 1146.0, + "completions/mean_length": 665.875, + "completions/mean_terminated_length": 665.875, + "completions/min_length": 429.0, + "completions/min_terminated_length": 429.0, + "epoch": 0.18549848942598188, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0462837889790535, + "learning_rate": 2.868141467921008e-06, + "loss": 0.0275, + "num_tokens": 52006720.0, + "reward": 4.170754432678223, + "reward_std": 1.558622121810913, + "rewards/accuracy_reward/mean": 3.4246606826782227, + "rewards/accuracy_reward/std": 3.711592674255371, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 307 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1015.0, + "completions/max_terminated_length": 1015.0, + "completions/mean_length": 607.515625, + "completions/mean_terminated_length": 607.515625, + "completions/min_length": 404.0, + "completions/min_terminated_length": 404.0, + "epoch": 0.18610271903323264, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04394195228815079, + "learning_rate": 2.8669760864691005e-06, + "loss": 0.0203, + "num_tokens": 52175681.0, + "reward": 4.732952117919922, + "reward_std": 2.0308585166931152, + "rewards/accuracy_reward/mean": 3.982952117919922, + "rewards/accuracy_reward/std": 3.641293525695801, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1155.0, + "completions/mean_length": 622.1875, + "completions/mean_terminated_length": 576.1935424804688, + "completions/min_length": 274.0, + "completions/min_terminated_length": 274.0, + "epoch": 0.18670694864048337, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04428974911570549, + "learning_rate": 2.8658058445662756e-06, + "loss": 0.0004, + "num_tokens": 52328781.0, + "reward": 1.6486968994140625, + "reward_std": 1.2697091102600098, + "rewards/accuracy_reward/mean": 0.9299468994140625, + "rewards/accuracy_reward/std": 2.496145248413086, + "rewards/tag_count_reward/mean": 0.71875, + "rewards/tag_count_reward/std": 0.14433756470680237, + "step": 309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1393.0, + "completions/max_terminated_length": 1393.0, + "completions/mean_length": 632.53125, + "completions/mean_terminated_length": 632.53125, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "epoch": 0.18731117824773413, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04084528982639313, + "learning_rate": 2.8646307468863327e-06, + "loss": -0.0141, + "num_tokens": 52494831.0, + "reward": 4.64107608795166, + "reward_std": 1.743304967880249, + "rewards/accuracy_reward/mean": 3.89107608795166, + "rewards/accuracy_reward/std": 3.5888285636901855, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1559.0, + "completions/max_terminated_length": 1559.0, + "completions/mean_length": 679.84375, + "completions/mean_terminated_length": 679.84375, + "completions/min_length": 399.0, + "completions/min_terminated_length": 399.0, + "epoch": 0.1879154078549849, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04823148623108864, + "learning_rate": 2.863450798122466e-06, + "loss": 0.0784, + "num_tokens": 52671093.0, + "reward": 7.007308483123779, + "reward_std": 2.191516160964966, + "rewards/accuracy_reward/mean": 6.257308483123779, + "rewards/accuracy_reward/std": 2.6099486351013184, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1270.0, + "completions/max_terminated_length": 1270.0, + "completions/mean_length": 513.84375, + "completions/mean_terminated_length": 513.84375, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 0.18851963746223566, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.060451727360486984, + "learning_rate": 2.862266002987244e-06, + "loss": 0.055, + "num_tokens": 52855467.0, + "reward": 3.2377142906188965, + "reward_std": 2.416351318359375, + "rewards/accuracy_reward/mean": 2.4877142906188965, + "rewards/accuracy_reward/std": 3.6264843940734863, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 841.0, + "completions/max_terminated_length": 841.0, + "completions/mean_length": 427.703125, + "completions/mean_terminated_length": 427.703125, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, + "epoch": 0.18912386706948642, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04745763912796974, + "learning_rate": 2.86107636621259e-06, + "loss": 0.0034, + "num_tokens": 53141928.0, + "reward": 4.169847011566162, + "reward_std": 2.171255111694336, + "rewards/accuracy_reward/mean": 3.419846534729004, + "rewards/accuracy_reward/std": 3.9364054203033447, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 313 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 677.0, + "completions/max_terminated_length": 677.0, + "completions/mean_length": 508.203125, + "completions/mean_terminated_length": 508.203125, + "completions/min_length": 282.0, + "completions/min_terminated_length": 282.0, + "epoch": 0.18972809667673715, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.030896611511707306, + "learning_rate": 2.859881892549766e-06, + "loss": 0.0023, + "num_tokens": 53287477.0, + "reward": 5.245253562927246, + "reward_std": 1.4855480194091797, + "rewards/accuracy_reward/mean": 4.495253562927246, + "rewards/accuracy_reward/std": 3.6622567176818848, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 314 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1448.0, + "completions/max_terminated_length": 1448.0, + "completions/mean_length": 696.71875, + "completions/mean_terminated_length": 696.71875, + "completions/min_length": 364.0, + "completions/min_terminated_length": 364.0, + "epoch": 0.1903323262839879, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.042556993663311005, + "learning_rate": 2.858682586769352e-06, + "loss": -0.0081, + "num_tokens": 53456835.0, + "reward": 3.5006415843963623, + "reward_std": 2.128211498260498, + "rewards/accuracy_reward/mean": 2.7506415843963623, + "rewards/accuracy_reward/std": 3.5483973026275635, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 858.0, + "completions/max_terminated_length": 858.0, + "completions/mean_length": 575.3125, + "completions/mean_terminated_length": 575.3125, + "completions/min_length": 305.0, + "completions/min_terminated_length": 305.0, + "epoch": 0.19093655589123867, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.044105637818574905, + "learning_rate": 2.857478453661224e-06, + "loss": 0.0139, + "num_tokens": 53642103.0, + "reward": 3.6492412090301514, + "reward_std": 2.778947353363037, + "rewards/accuracy_reward/mean": 2.8992414474487305, + "rewards/accuracy_reward/std": 3.6073555946350098, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1046.0, + "completions/max_terminated_length": 1046.0, + "completions/mean_length": 667.15625, + "completions/mean_terminated_length": 667.15625, + "completions/min_length": 361.0, + "completions/min_terminated_length": 361.0, + "epoch": 0.19154078549848944, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.039773907512426376, + "learning_rate": 2.85626949803454e-06, + "loss": 0.0301, + "num_tokens": 53901089.0, + "reward": 1.518271803855896, + "reward_std": 1.6749372482299805, + "rewards/accuracy_reward/mean": 0.7682718634605408, + "rewards/accuracy_reward/std": 2.3705828189849854, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 784.0, + "completions/max_terminated_length": 784.0, + "completions/mean_length": 429.71875, + "completions/mean_terminated_length": 429.71875, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 0.19214501510574017, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.027672428637742996, + "learning_rate": 2.8550557247177197e-06, + "loss": 0.0005, + "num_tokens": 54045791.0, + "reward": 5.721514701843262, + "reward_std": 1.3278844356536865, + "rewards/accuracy_reward/mean": 4.9715142250061035, + "rewards/accuracy_reward/std": 3.455305576324463, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 918.0, + "completions/max_terminated_length": 918.0, + "completions/mean_length": 553.34375, + "completions/mean_terminated_length": 553.34375, + "completions/min_length": 341.0, + "completions/min_terminated_length": 341.0, + "epoch": 0.19274924471299093, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03373998776078224, + "learning_rate": 2.853837138558421e-06, + "loss": 0.0356, + "num_tokens": 54214885.0, + "reward": 3.6208295822143555, + "reward_std": 1.463158369064331, + "rewards/accuracy_reward/mean": 2.8786423206329346, + "rewards/accuracy_reward/std": 3.6585569381713867, + "rewards/tag_count_reward/mean": 0.7421875, + "rewards/tag_count_reward/std": 0.0625, + "step": 319 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1329.0, + "completions/max_terminated_length": 1329.0, + "completions/mean_length": 703.53125, + "completions/mean_terminated_length": 703.53125, + "completions/min_length": 308.0, + "completions/min_terminated_length": 308.0, + "epoch": 0.1933534743202417, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.05145782232284546, + "learning_rate": 2.8526137444235257e-06, + "loss": 0.0179, + "num_tokens": 54441927.0, + "reward": 4.802280426025391, + "reward_std": 1.8394544124603271, + "rewards/accuracy_reward/mean": 4.052280902862549, + "rewards/accuracy_reward/std": 3.7585175037384033, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 882.0, + "completions/max_terminated_length": 882.0, + "completions/mean_length": 639.015625, + "completions/mean_terminated_length": 639.015625, + "completions/min_length": 405.0, + "completions/min_terminated_length": 405.0, + "epoch": 0.19395770392749245, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.04275014251470566, + "learning_rate": 2.851385547199118e-06, + "loss": -0.0057, + "num_tokens": 54565512.0, + "reward": 3.0721583366394043, + "reward_std": 1.8185070753097534, + "rewards/accuracy_reward/mean": 2.3221583366394043, + "rewards/accuracy_reward/std": 3.492722272872925, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 966.0, + "completions/max_terminated_length": 966.0, + "completions/mean_length": 621.015625, + "completions/mean_terminated_length": 621.015625, + "completions/min_length": 403.0, + "completions/min_terminated_length": 403.0, + "epoch": 0.19456193353474321, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.021120425313711166, + "learning_rate": 2.850152551790464e-06, + "loss": 0.0009, + "num_tokens": 54723833.0, + "reward": 2.5728487968444824, + "reward_std": 1.0511932373046875, + "rewards/accuracy_reward/mean": 1.8345675468444824, + "rewards/accuracy_reward/std": 3.2801332473754883, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1350.0, + "completions/mean_length": 831.15625, + "completions/mean_terminated_length": 728.0338745117188, + "completions/min_length": 511.0, + "completions/min_terminated_length": 511.0, + "epoch": 0.19516616314199395, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.029301516711711884, + "learning_rate": 2.848914763121994e-06, + "loss": -0.0675, + "num_tokens": 54882691.0, + "reward": 1.071610927581787, + "reward_std": 0.9859980344772339, + "rewards/accuracy_reward/mean": 0.3802046775817871, + "rewards/accuracy_reward/std": 1.6428438425064087, + "rewards/tag_count_reward/mean": 0.69140625, + "rewards/tag_count_reward/std": 0.2028672844171524, + "step": 323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1094.0, + "completions/max_terminated_length": 1094.0, + "completions/mean_length": 585.078125, + "completions/mean_terminated_length": 585.078125, + "completions/min_length": 315.0, + "completions/min_terminated_length": 315.0, + "epoch": 0.1957703927492447, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03762906789779663, + "learning_rate": 2.847672186137282e-06, + "loss": -0.0025, + "num_tokens": 55129912.0, + "reward": 3.9450442790985107, + "reward_std": 1.46536386013031, + "rewards/accuracy_reward/mean": 3.2067627906799316, + "rewards/accuracy_reward/std": 3.7325448989868164, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1326.0, + "completions/mean_length": 744.34375, + "completions/mean_terminated_length": 723.6508178710938, + "completions/min_length": 331.0, + "completions/min_terminated_length": 331.0, + "epoch": 0.19637462235649547, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.043559480458498, + "learning_rate": 2.8464248257990262e-06, + "loss": -0.0497, + "num_tokens": 55325166.0, + "reward": 3.5702157020568848, + "reward_std": 2.4184439182281494, + "rewards/accuracy_reward/mean": 2.8280282020568848, + "rewards/accuracy_reward/std": 3.6202828884124756, + "rewards/tag_count_reward/mean": 0.7421875, + "rewards/tag_count_reward/std": 0.043842025101184845, + "step": 325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1360.0, + "completions/mean_length": 809.46875, + "completions/mean_terminated_length": 789.8095703125, + "completions/min_length": 284.0, + "completions/min_terminated_length": 284.0, + "epoch": 0.19697885196374623, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.035453181713819504, + "learning_rate": 2.8451726870890274e-06, + "loss": -0.0214, + "num_tokens": 55506716.0, + "reward": 1.5122835636138916, + "reward_std": 1.3252880573272705, + "rewards/accuracy_reward/mean": 0.7740023732185364, + "rewards/accuracy_reward/std": 1.9775466918945312, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1099.0, + "completions/max_terminated_length": 1099.0, + "completions/mean_length": 612.78125, + "completions/mean_terminated_length": 612.78125, + "completions/min_length": 357.0, + "completions/min_terminated_length": 357.0, + "epoch": 0.19758308157099697, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.032914530485868454, + "learning_rate": 2.843915775008172e-06, + "loss": 0.004, + "num_tokens": 55759374.0, + "reward": 3.7475764751434326, + "reward_std": 1.0039777755737305, + "rewards/accuracy_reward/mean": 2.9975762367248535, + "rewards/accuracy_reward/std": 3.6869051456451416, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1464.0, + "completions/max_terminated_length": 1464.0, + "completions/mean_length": 574.203125, + "completions/mean_terminated_length": 574.203125, + "completions/min_length": 289.0, + "completions/min_terminated_length": 289.0, + "epoch": 0.19818731117824773, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.051278989762067795, + "learning_rate": 2.8426540945764106e-06, + "loss": 0.0291, + "num_tokens": 55951019.0, + "reward": 6.0466227531433105, + "reward_std": 3.0774035453796387, + "rewards/accuracy_reward/mean": 5.2966227531433105, + "rewards/accuracy_reward/std": 3.3997247219085693, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 798.0, + "completions/max_terminated_length": 798.0, + "completions/mean_length": 500.96875, + "completions/mean_terminated_length": 500.96875, + "completions/min_length": 283.0, + "completions/min_terminated_length": 283.0, + "epoch": 0.1987915407854985, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05263667181134224, + "learning_rate": 2.841387650832738e-06, + "loss": 0.0137, + "num_tokens": 56074185.0, + "reward": 6.2880706787109375, + "reward_std": 2.5221376419067383, + "rewards/accuracy_reward/mean": 5.5380706787109375, + "rewards/accuracy_reward/std": 3.2234015464782715, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 329 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 910.0, + "completions/max_terminated_length": 910.0, + "completions/mean_length": 665.609375, + "completions/mean_terminated_length": 665.609375, + "completions/min_length": 457.0, + "completions/min_terminated_length": 457.0, + "epoch": 0.19939577039274925, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03015376813709736, + "learning_rate": 2.840116448835171e-06, + "loss": 0.0329, + "num_tokens": 56243712.0, + "reward": 4.067489147186279, + "reward_std": 1.3612611293792725, + "rewards/accuracy_reward/mean": 3.3174889087677, + "rewards/accuracy_reward/std": 3.7072811126708984, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 991.0, + "completions/max_terminated_length": 991.0, + "completions/mean_length": 579.234375, + "completions/mean_terminated_length": 579.234375, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 0.2, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04107663035392761, + "learning_rate": 2.8388404936607345e-06, + "loss": -0.0132, + "num_tokens": 56423647.0, + "reward": 4.704195976257324, + "reward_std": 2.18276309967041, + "rewards/accuracy_reward/mean": 3.9541962146759033, + "rewards/accuracy_reward/std": 3.8146166801452637, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 331 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1009.0, + "completions/max_terminated_length": 1009.0, + "completions/mean_length": 620.65625, + "completions/mean_terminated_length": 620.65625, + "completions/min_length": 338.0, + "completions/min_terminated_length": 338.0, + "epoch": 0.20060422960725074, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04212847724556923, + "learning_rate": 2.8375597904054334e-06, + "loss": 0.0181, + "num_tokens": 56626873.0, + "reward": 3.1676836013793945, + "reward_std": 2.0720534324645996, + "rewards/accuracy_reward/mean": 2.4176836013793945, + "rewards/accuracy_reward/std": 3.3789708614349365, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 810.0, + "completions/max_terminated_length": 810.0, + "completions/mean_length": 454.484375, + "completions/mean_terminated_length": 454.484375, + "completions/min_length": 283.0, + "completions/min_terminated_length": 283.0, + "epoch": 0.2012084592145015, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03343146666884422, + "learning_rate": 2.8362743441842364e-06, + "loss": -0.0198, + "num_tokens": 56847336.0, + "reward": 4.178601264953613, + "reward_std": 1.6406760215759277, + "rewards/accuracy_reward/mean": 3.4286012649536133, + "rewards/accuracy_reward/std": 3.7464778423309326, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 712.0, + "completions/max_terminated_length": 712.0, + "completions/mean_length": 475.578125, + "completions/mean_terminated_length": 475.578125, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.20181268882175227, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05259865149855614, + "learning_rate": 2.834984160131057e-06, + "loss": 0.0219, + "num_tokens": 57002685.0, + "reward": 5.287570953369141, + "reward_std": 3.1856470108032227, + "rewards/accuracy_reward/mean": 4.537569999694824, + "rewards/accuracy_reward/std": 3.579472780227661, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1091.0, + "completions/max_terminated_length": 1091.0, + "completions/mean_length": 608.015625, + "completions/mean_terminated_length": 608.015625, + "completions/min_length": 342.0, + "completions/min_terminated_length": 342.0, + "epoch": 0.20241691842900303, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03501332551240921, + "learning_rate": 2.833689243398728e-06, + "loss": -0.0019, + "num_tokens": 57162078.0, + "reward": 2.912978172302246, + "reward_std": 1.5541951656341553, + "rewards/accuracy_reward/mean": 2.162978410720825, + "rewards/accuracy_reward/std": 3.317505121231079, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1267.0, + "completions/max_terminated_length": 1267.0, + "completions/mean_length": 701.296875, + "completions/mean_terminated_length": 701.296875, + "completions/min_length": 400.0, + "completions/min_terminated_length": 400.0, + "epoch": 0.2030211480362538, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04747768118977547, + "learning_rate": 2.8323895991589866e-06, + "loss": 0.0173, + "num_tokens": 57356705.0, + "reward": 5.069206714630127, + "reward_std": 2.330598831176758, + "rewards/accuracy_reward/mean": 4.319206714630127, + "rewards/accuracy_reward/std": 3.6357781887054443, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1021.0, + "completions/max_terminated_length": 1021.0, + "completions/mean_length": 579.984375, + "completions/mean_terminated_length": 579.984375, + "completions/min_length": 352.0, + "completions/min_terminated_length": 352.0, + "epoch": 0.20362537764350452, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04213012009859085, + "learning_rate": 2.8310852326024497e-06, + "loss": -0.0273, + "num_tokens": 57501520.0, + "reward": 4.169308662414551, + "reward_std": 1.9551122188568115, + "rewards/accuracy_reward/mean": 3.4310271739959717, + "rewards/accuracy_reward/std": 3.7822256088256836, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 869.0, + "completions/max_terminated_length": 869.0, + "completions/mean_length": 574.34375, + "completions/mean_terminated_length": 574.34375, + "completions/min_length": 379.0, + "completions/min_terminated_length": 379.0, + "epoch": 0.20422960725075529, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.013150525279343128, + "learning_rate": 2.829776148938596e-06, + "loss": -0.0024, + "num_tokens": 57664150.0, + "reward": 0.5993921756744385, + "reward_std": 0.6079599857330322, + "rewards/accuracy_reward/mean": -0.15060780942440033, + "rewards/accuracy_reward/std": 1.0520886182785034, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 698.0, + "completions/max_terminated_length": 698.0, + "completions/mean_length": 469.53125, + "completions/mean_terminated_length": 469.53125, + "completions/min_length": 289.0, + "completions/min_terminated_length": 289.0, + "epoch": 0.20483383685800605, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.020300643518567085, + "learning_rate": 2.82846235339574e-06, + "loss": 0.0023, + "num_tokens": 57861128.0, + "reward": 6.076446056365967, + "reward_std": 0.6532200574874878, + "rewards/accuracy_reward/mean": 5.326446533203125, + "rewards/accuracy_reward/std": 3.343480110168457, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 339 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1162.0, + "completions/max_terminated_length": 1162.0, + "completions/mean_length": 685.15625, + "completions/mean_terminated_length": 685.15625, + "completions/min_length": 416.0, + "completions/min_terminated_length": 416.0, + "epoch": 0.2054380664652568, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05129655450582504, + "learning_rate": 2.8271438512210196e-06, + "loss": -0.0387, + "num_tokens": 58062722.0, + "reward": 5.719560623168945, + "reward_std": 2.9254727363586426, + "rewards/accuracy_reward/mean": 4.969560623168945, + "rewards/accuracy_reward/std": 3.454256772994995, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 991.0, + "completions/max_terminated_length": 991.0, + "completions/mean_length": 472.0625, + "completions/mean_terminated_length": 472.0625, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "epoch": 0.20604229607250754, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.024255309253931046, + "learning_rate": 2.825820647680368e-06, + "loss": 0.0018, + "num_tokens": 58198886.0, + "reward": 3.153049945831299, + "reward_std": 0.9314358830451965, + "rewards/accuracy_reward/mean": 2.406956195831299, + "rewards/accuracy_reward/std": 3.5075862407684326, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 682.0, + "completions/mean_length": 501.5625, + "completions/mean_terminated_length": 477.0158996582031, + "completions/min_length": 305.0, + "completions/min_terminated_length": 305.0, + "epoch": 0.2066465256797583, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03289877995848656, + "learning_rate": 2.8244927480584954e-06, + "loss": -0.0004, + "num_tokens": 58326026.0, + "reward": 5.39091682434082, + "reward_std": 1.5359001159667969, + "rewards/accuracy_reward/mean": 4.65263557434082, + "rewards/accuracy_reward/std": 3.6171464920043945, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 973.0, + "completions/mean_length": 502.328125, + "completions/mean_terminated_length": 477.7936706542969, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.20725075528700906, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.01790809817612171, + "learning_rate": 2.8231601576588664e-06, + "loss": -0.0613, + "num_tokens": 58463135.0, + "reward": 5.834895133972168, + "reward_std": 1.2763961553573608, + "rewards/accuracy_reward/mean": 5.09661340713501, + "rewards/accuracy_reward/std": 3.4508111476898193, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1140.0, + "completions/max_terminated_length": 1140.0, + "completions/mean_length": 665.09375, + "completions/mean_terminated_length": 665.09375, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, + "epoch": 0.20785498489425983, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04738441854715347, + "learning_rate": 2.8218228818036828e-06, + "loss": 0.0219, + "num_tokens": 58617605.0, + "reward": 4.08117151260376, + "reward_std": 1.9007573127746582, + "rewards/accuracy_reward/mean": 3.3311715126037598, + "rewards/accuracy_reward/std": 3.7065107822418213, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 777.0, + "completions/max_terminated_length": 777.0, + "completions/mean_length": 543.75, + "completions/mean_terminated_length": 543.75, + "completions/min_length": 351.0, + "completions/min_terminated_length": 351.0, + "epoch": 0.2084592145015106, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05082084238529205, + "learning_rate": 2.820480925833856e-06, + "loss": 0.0024, + "num_tokens": 58766965.0, + "reward": 5.30200719833374, + "reward_std": 2.9830708503723145, + "rewards/accuracy_reward/mean": 4.55200719833374, + "rewards/accuracy_reward/std": 3.659163236618042, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1001.0, + "completions/max_terminated_length": 1001.0, + "completions/mean_length": 679.25, + "completions/mean_terminated_length": 679.25, + "completions/min_length": 421.0, + "completions/min_terminated_length": 421.0, + "epoch": 0.20906344410876132, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04837791621685028, + "learning_rate": 2.819134295108992e-06, + "loss": -0.0091, + "num_tokens": 58934677.0, + "reward": 4.6094279289245605, + "reward_std": 2.568563222885132, + "rewards/accuracy_reward/mean": 3.8594279289245605, + "rewards/accuracy_reward/std": 3.7310562133789062, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1155.0, + "completions/max_terminated_length": 1155.0, + "completions/mean_length": 638.140625, + "completions/mean_terminated_length": 638.140625, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 0.20966767371601208, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05773070827126503, + "learning_rate": 2.8177829950073664e-06, + "loss": 0.0438, + "num_tokens": 59102046.0, + "reward": 4.508519172668457, + "reward_std": 3.194716453552246, + "rewards/accuracy_reward/mean": 3.758518934249878, + "rewards/accuracy_reward/std": 3.638535976409912, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1355.0, + "completions/max_terminated_length": 1355.0, + "completions/mean_length": 742.8125, + "completions/mean_terminated_length": 742.8125, + "completions/min_length": 460.0, + "completions/min_terminated_length": 460.0, + "epoch": 0.21027190332326284, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03144252300262451, + "learning_rate": 2.8164270309259034e-06, + "loss": 0.0023, + "num_tokens": 59274738.0, + "reward": 1.6391047239303589, + "reward_std": 1.1225427389144897, + "rewards/accuracy_reward/mean": 0.8891047239303589, + "rewards/accuracy_reward/std": 2.730302572250366, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1110.0, + "completions/max_terminated_length": 1110.0, + "completions/mean_length": 645.640625, + "completions/mean_terminated_length": 645.640625, + "completions/min_length": 434.0, + "completions/min_terminated_length": 434.0, + "epoch": 0.2108761329305136, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.048049021512269974, + "learning_rate": 2.8150664082801537e-06, + "loss": 0.0117, + "num_tokens": 59419099.0, + "reward": 2.906515598297119, + "reward_std": 2.3188655376434326, + "rewards/accuracy_reward/mean": 2.156515598297119, + "rewards/accuracy_reward/std": 3.4725019931793213, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 851.0, + "completions/max_terminated_length": 851.0, + "completions/mean_length": 581.75, + "completions/mean_terminated_length": 581.75, + "completions/min_length": 401.0, + "completions/min_terminated_length": 401.0, + "epoch": 0.21148036253776434, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03527412936091423, + "learning_rate": 2.8137011325042757e-06, + "loss": 0.0219, + "num_tokens": 59586539.0, + "reward": 3.881635904312134, + "reward_std": 1.4620623588562012, + "rewards/accuracy_reward/mean": 3.131635904312134, + "rewards/accuracy_reward/std": 3.661097288131714, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1706.0, + "completions/max_terminated_length": 1706.0, + "completions/mean_length": 630.96875, + "completions/mean_terminated_length": 630.96875, + "completions/min_length": 384.0, + "completions/min_terminated_length": 384.0, + "epoch": 0.2120845921450151, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.038255881518125534, + "learning_rate": 2.8123312090510106e-06, + "loss": 0.03, + "num_tokens": 59760153.0, + "reward": 5.048037052154541, + "reward_std": 2.102996826171875, + "rewards/accuracy_reward/mean": 4.298037528991699, + "rewards/accuracy_reward/std": 3.719632625579834, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 351 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 884.0, + "completions/max_terminated_length": 884.0, + "completions/mean_length": 522.546875, + "completions/mean_terminated_length": 522.546875, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "epoch": 0.21268882175226586, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.042473532259464264, + "learning_rate": 2.810956643391662e-06, + "loss": 0.0076, + "num_tokens": 59903020.0, + "reward": 5.085963249206543, + "reward_std": 1.4104541540145874, + "rewards/accuracy_reward/mean": 4.335963249206543, + "rewards/accuracy_reward/std": 3.618025302886963, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 860.0, + "completions/max_terminated_length": 860.0, + "completions/mean_length": 467.171875, + "completions/mean_terminated_length": 467.171875, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "epoch": 0.21329305135951662, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03250286355614662, + "learning_rate": 2.8095774410160737e-06, + "loss": 0.0039, + "num_tokens": 60053527.0, + "reward": 5.409519672393799, + "reward_std": 1.5330241918563843, + "rewards/accuracy_reward/mean": 4.659519195556641, + "rewards/accuracy_reward/std": 3.475072145462036, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1092.0, + "completions/max_terminated_length": 1092.0, + "completions/mean_length": 561.796875, + "completions/mean_terminated_length": 561.796875, + "completions/min_length": 362.0, + "completions/min_terminated_length": 362.0, + "epoch": 0.21389728096676738, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03228819742798805, + "learning_rate": 2.808193607432609e-06, + "loss": -0.0005, + "num_tokens": 60169386.0, + "reward": 1.196713924407959, + "reward_std": 1.2962055206298828, + "rewards/accuracy_reward/mean": 0.45062020421028137, + "rewards/accuracy_reward/std": 1.8283802270889282, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 913.0, + "completions/max_terminated_length": 913.0, + "completions/mean_length": 510.578125, + "completions/mean_terminated_length": 510.578125, + "completions/min_length": 373.0, + "completions/min_terminated_length": 373.0, + "epoch": 0.21450151057401812, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04013601690530777, + "learning_rate": 2.8068051481681255e-06, + "loss": 0.0218, + "num_tokens": 60351535.0, + "reward": 3.541231155395508, + "reward_std": 1.9081387519836426, + "rewards/accuracy_reward/mean": 2.791231155395508, + "rewards/accuracy_reward/std": 3.666602611541748, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1329.0, + "completions/max_terminated_length": 1329.0, + "completions/mean_length": 724.421875, + "completions/mean_terminated_length": 724.421875, + "completions/min_length": 332.0, + "completions/min_terminated_length": 332.0, + "epoch": 0.21510574018126888, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04438190907239914, + "learning_rate": 2.805412068767958e-06, + "loss": 0.0138, + "num_tokens": 60510650.0, + "reward": 4.2879133224487305, + "reward_std": 1.9252315759658813, + "rewards/accuracy_reward/mean": 3.5379133224487305, + "rewards/accuracy_reward/std": 3.702610492706299, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 820.0, + "completions/max_terminated_length": 820.0, + "completions/mean_length": 505.46875, + "completions/mean_terminated_length": 505.46875, + "completions/min_length": 308.0, + "completions/min_terminated_length": 308.0, + "epoch": 0.21570996978851964, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03804086521267891, + "learning_rate": 2.8040143747958912e-06, + "loss": 0.0107, + "num_tokens": 60671528.0, + "reward": 6.410764694213867, + "reward_std": 1.7684028148651123, + "rewards/accuracy_reward/mean": 5.660765171051025, + "rewards/accuracy_reward/std": 3.0248849391937256, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 357 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 695.0, + "completions/max_terminated_length": 695.0, + "completions/mean_length": 488.25, + "completions/mean_terminated_length": 488.25, + "completions/min_length": 272.0, + "completions/min_terminated_length": 272.0, + "epoch": 0.2163141993957704, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.025046300143003464, + "learning_rate": 2.802612071834141e-06, + "loss": -0.0052, + "num_tokens": 60803736.0, + "reward": 4.642375469207764, + "reward_std": 1.1469752788543701, + "rewards/accuracy_reward/mean": 3.8923757076263428, + "rewards/accuracy_reward/std": 3.5977511405944824, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 358 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1073.0, + "completions/max_terminated_length": 1073.0, + "completions/mean_length": 526.140625, + "completions/mean_terminated_length": 526.140625, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "epoch": 0.21691842900302113, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03585590422153473, + "learning_rate": 2.8012051654833314e-06, + "loss": -0.0168, + "num_tokens": 60981073.0, + "reward": 5.139657974243164, + "reward_std": 1.0156636238098145, + "rewards/accuracy_reward/mean": 4.389657497406006, + "rewards/accuracy_reward/std": 3.6940176486968994, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 359 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 782.0, + "completions/max_terminated_length": 782.0, + "completions/mean_length": 473.015625, + "completions/mean_terminated_length": 473.015625, + "completions/min_length": 304.0, + "completions/min_terminated_length": 304.0, + "epoch": 0.2175226586102719, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.021196121349930763, + "learning_rate": 2.79979366136247e-06, + "loss": -0.0077, + "num_tokens": 61123970.0, + "reward": 2.5734496116638184, + "reward_std": 0.9135377407073975, + "rewards/accuracy_reward/mean": 1.8234494924545288, + "rewards/accuracy_reward/std": 3.18345046043396, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 757.0, + "completions/max_terminated_length": 757.0, + "completions/mean_length": 569.078125, + "completions/mean_terminated_length": 569.078125, + "completions/min_length": 373.0, + "completions/min_terminated_length": 373.0, + "epoch": 0.21812688821752266, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04356391727924347, + "learning_rate": 2.798377565108929e-06, + "loss": -0.0159, + "num_tokens": 61283799.0, + "reward": 3.9833054542541504, + "reward_std": 2.20652174949646, + "rewards/accuracy_reward/mean": 3.2333054542541504, + "rewards/accuracy_reward/std": 3.7292678356170654, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 361 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 813.0, + "completions/max_terminated_length": 813.0, + "completions/mean_length": 504.953125, + "completions/mean_terminated_length": 504.953125, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, + "epoch": 0.21873111782477342, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.01662577874958515, + "learning_rate": 2.796956882378421e-06, + "loss": -0.0069, + "num_tokens": 61431796.0, + "reward": 6.479079246520996, + "reward_std": 0.5077196955680847, + "rewards/accuracy_reward/mean": 5.729079246520996, + "rewards/accuracy_reward/std": 3.0593061447143555, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 362 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 847.0, + "completions/max_terminated_length": 847.0, + "completions/mean_length": 552.125, + "completions/mean_terminated_length": 552.125, + "completions/min_length": 356.0, + "completions/min_terminated_length": 356.0, + "epoch": 0.21933534743202418, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.040273021906614304, + "learning_rate": 2.795531618844975e-06, + "loss": 0.0115, + "num_tokens": 61605964.0, + "reward": 7.6575751304626465, + "reward_std": 1.3660824298858643, + "rewards/accuracy_reward/mean": 6.915387153625488, + "rewards/accuracy_reward/std": 1.8695274591445923, + "rewards/tag_count_reward/mean": 0.7421875, + "rewards/tag_count_reward/std": 0.0625, + "step": 363 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 933.0, + "completions/max_terminated_length": 933.0, + "completions/mean_length": 576.09375, + "completions/mean_terminated_length": 576.09375, + "completions/min_length": 302.0, + "completions/min_terminated_length": 302.0, + "epoch": 0.2199395770392749, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.041846442967653275, + "learning_rate": 2.794101780200916e-06, + "loss": -0.0099, + "num_tokens": 61798626.0, + "reward": 4.092416763305664, + "reward_std": 2.28971791267395, + "rewards/accuracy_reward/mean": 3.342416763305664, + "rewards/accuracy_reward/std": 3.6529245376586914, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 934.0, + "completions/mean_length": 551.71875, + "completions/mean_terminated_length": 527.96826171875, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "epoch": 0.22054380664652568, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06118430569767952, + "learning_rate": 2.7926673721568423e-06, + "loss": 0.0161, + "num_tokens": 61973712.0, + "reward": 6.973851680755615, + "reward_std": 2.855766773223877, + "rewards/accuracy_reward/mean": 6.247289657592773, + "rewards/accuracy_reward/std": 2.762302875518799, + "rewards/tag_count_reward/mean": 0.7265625, + "rewards/tag_count_reward/std": 0.13152606785297394, + "step": 365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1315.0, + "completions/max_terminated_length": 1315.0, + "completions/mean_length": 709.703125, + "completions/mean_terminated_length": 709.703125, + "completions/min_length": 473.0, + "completions/min_terminated_length": 473.0, + "epoch": 0.22114803625377644, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04362497478723526, + "learning_rate": 2.791228400441601e-06, + "loss": 0.0012, + "num_tokens": 62137485.0, + "reward": 3.028078556060791, + "reward_std": 1.8901805877685547, + "rewards/accuracy_reward/mean": 2.278078556060791, + "rewards/accuracy_reward/std": 3.378589630126953, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 366 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 934.0, + "completions/max_terminated_length": 934.0, + "completions/mean_length": 631.875, + "completions/mean_terminated_length": 631.875, + "completions/min_length": 313.0, + "completions/min_terminated_length": 313.0, + "epoch": 0.2217522658610272, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0455370731651783, + "learning_rate": 2.7897848708022646e-06, + "loss": -0.026, + "num_tokens": 62318885.0, + "reward": 1.9572060108184814, + "reward_std": 2.3593645095825195, + "rewards/accuracy_reward/mean": 1.207205891609192, + "rewards/accuracy_reward/std": 2.7304201126098633, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 367 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1167.0, + "completions/max_terminated_length": 1167.0, + "completions/mean_length": 643.296875, + "completions/mean_terminated_length": 643.296875, + "completions/min_length": 392.0, + "completions/min_terminated_length": 392.0, + "epoch": 0.22235649546827796, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.037439536303281784, + "learning_rate": 2.7883367890041123e-06, + "loss": 0.0351, + "num_tokens": 62500504.0, + "reward": 3.0937395095825195, + "reward_std": 2.0415186882019043, + "rewards/accuracy_reward/mean": 2.3437395095825195, + "rewards/accuracy_reward/std": 3.3003382682800293, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 873.0, + "completions/max_terminated_length": 873.0, + "completions/mean_length": 572.734375, + "completions/mean_terminated_length": 572.734375, + "completions/min_length": 384.0, + "completions/min_terminated_length": 384.0, + "epoch": 0.2229607250755287, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03989775478839874, + "learning_rate": 2.786884160830601e-06, + "loss": 0.0199, + "num_tokens": 62655095.0, + "reward": 3.7768263816833496, + "reward_std": 2.500540256500244, + "rewards/accuracy_reward/mean": 3.0307328701019287, + "rewards/accuracy_reward/std": 3.9461681842803955, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 369 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 991.0, + "completions/mean_length": 561.25, + "completions/mean_terminated_length": 537.6508178710938, + "completions/min_length": 291.0, + "completions/min_terminated_length": 291.0, + "epoch": 0.22356495468277945, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05075139179825783, + "learning_rate": 2.7854269920833477e-06, + "loss": 0.0286, + "num_tokens": 62898871.0, + "reward": 4.962396621704102, + "reward_std": 2.927757740020752, + "rewards/accuracy_reward/mean": 4.216302871704102, + "rewards/accuracy_reward/std": 3.7549078464508057, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 870.0, + "completions/max_terminated_length": 870.0, + "completions/mean_length": 542.46875, + "completions/mean_terminated_length": 542.46875, + "completions/min_length": 335.0, + "completions/min_terminated_length": 335.0, + "epoch": 0.22416918429003022, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.022479213774204254, + "learning_rate": 2.7839652885821024e-06, + "loss": 0.0017, + "num_tokens": 63059317.0, + "reward": 4.286976337432861, + "reward_std": 0.70224529504776, + "rewards/accuracy_reward/mean": 3.5369763374328613, + "rewards/accuracy_reward/std": 3.7130086421966553, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 371 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1241.0, + "completions/max_terminated_length": 1241.0, + "completions/mean_length": 482.96875, + "completions/mean_terminated_length": 482.96875, + "completions/min_length": 310.0, + "completions/min_terminated_length": 310.0, + "epoch": 0.22477341389728098, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03703438863158226, + "learning_rate": 2.7824990561647276e-06, + "loss": 0.0378, + "num_tokens": 63202995.0, + "reward": 5.818498611450195, + "reward_std": 0.8822580575942993, + "rewards/accuracy_reward/mean": 5.068498611450195, + "rewards/accuracy_reward/std": 3.4807682037353516, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 957.0, + "completions/max_terminated_length": 957.0, + "completions/mean_length": 656.5625, + "completions/mean_terminated_length": 656.5625, + "completions/min_length": 407.0, + "completions/min_terminated_length": 407.0, + "epoch": 0.2253776435045317, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04146628826856613, + "learning_rate": 2.781028300687172e-06, + "loss": 0.0042, + "num_tokens": 63418135.0, + "reward": 3.565972089767456, + "reward_std": 1.061457633972168, + "rewards/accuracy_reward/mean": 2.815971851348877, + "rewards/accuracy_reward/std": 3.647961378097534, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 373 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1033.0, + "completions/mean_length": 549.453125, + "completions/mean_terminated_length": 525.6666870117188, + "completions/min_length": 323.0, + "completions/min_terminated_length": 323.0, + "epoch": 0.22598187311178247, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03022138401865959, + "learning_rate": 2.7795530280234504e-06, + "loss": 0.025, + "num_tokens": 63520852.0, + "reward": 3.662677526473999, + "reward_std": 1.1235084533691406, + "rewards/accuracy_reward/mean": 2.92439603805542, + "rewards/accuracy_reward/std": 3.666189432144165, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 873.0, + "completions/mean_length": 536.15625, + "completions/mean_terminated_length": 512.1587524414062, + "completions/min_length": 276.0, + "completions/min_terminated_length": 276.0, + "epoch": 0.22658610271903323, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.031701646745204926, + "learning_rate": 2.7780732440656176e-06, + "loss": -0.012, + "num_tokens": 63650094.0, + "reward": 1.8690240383148193, + "reward_std": 1.0519715547561646, + "rewards/accuracy_reward/mean": 1.1307427883148193, + "rewards/accuracy_reward/std": 2.7408080101013184, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 975.0, + "completions/max_terminated_length": 975.0, + "completions/mean_length": 609.09375, + "completions/mean_terminated_length": 609.09375, + "completions/min_length": 320.0, + "completions/min_terminated_length": 320.0, + "epoch": 0.227190332326284, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05556204542517662, + "learning_rate": 2.7765889547237466e-06, + "loss": -0.0143, + "num_tokens": 63803396.0, + "reward": 4.370233058929443, + "reward_std": 2.744415283203125, + "rewards/accuracy_reward/mean": 3.6202330589294434, + "rewards/accuracy_reward/std": 3.7180163860321045, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1261.0, + "completions/mean_length": 673.75, + "completions/mean_terminated_length": 651.9365234375, + "completions/min_length": 380.0, + "completions/min_terminated_length": 380.0, + "epoch": 0.22779456193353476, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.034318629652261734, + "learning_rate": 2.7751001659259044e-06, + "loss": -0.0003, + "num_tokens": 64048724.0, + "reward": 1.487889051437378, + "reward_std": 1.3843109607696533, + "rewards/accuracy_reward/mean": 0.7496078014373779, + "rewards/accuracy_reward/std": 2.167509078979492, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1368.0, + "completions/mean_length": 703.796875, + "completions/mean_terminated_length": 660.4354858398438, + "completions/min_length": 349.0, + "completions/min_terminated_length": 349.0, + "epoch": 0.2283987915407855, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.039266377687454224, + "learning_rate": 2.7736068836181283e-06, + "loss": 0.025, + "num_tokens": 64214311.0, + "reward": 2.7285349369049072, + "reward_std": 1.635340929031372, + "rewards/accuracy_reward/mean": 2.0136911869049072, + "rewards/accuracy_reward/std": 3.6303579807281494, + "rewards/tag_count_reward/mean": 0.71484375, + "rewards/tag_count_reward/std": 0.1597815304994583, + "step": 378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1182.0, + "completions/max_terminated_length": 1182.0, + "completions/mean_length": 471.390625, + "completions/mean_terminated_length": 471.390625, + "completions/min_length": 327.0, + "completions/min_terminated_length": 327.0, + "epoch": 0.22900302114803625, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03907524049282074, + "learning_rate": 2.7721091137644007e-06, + "loss": 0.0385, + "num_tokens": 64332304.0, + "reward": 7.714739799499512, + "reward_std": 1.5887811183929443, + "rewards/accuracy_reward/mean": 6.964739799499512, + "rewards/accuracy_reward/std": 1.813651442527771, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 379 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 903.0, + "completions/max_terminated_length": 903.0, + "completions/mean_length": 617.390625, + "completions/mean_terminated_length": 617.390625, + "completions/min_length": 393.0, + "completions/min_terminated_length": 393.0, + "epoch": 0.229607250755287, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04569073021411896, + "learning_rate": 2.7706068623466295e-06, + "loss": 0.0371, + "num_tokens": 64505705.0, + "reward": 4.027130126953125, + "reward_std": 2.4192826747894287, + "rewards/accuracy_reward/mean": 3.277129650115967, + "rewards/accuracy_reward/std": 3.7050797939300537, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 842.0, + "completions/max_terminated_length": 842.0, + "completions/mean_length": 553.09375, + "completions/mean_terminated_length": 553.09375, + "completions/min_length": 317.0, + "completions/min_terminated_length": 317.0, + "epoch": 0.23021148036253777, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.047406382858753204, + "learning_rate": 2.769100135364618e-06, + "loss": -0.0045, + "num_tokens": 64668559.0, + "reward": 6.649537086486816, + "reward_std": 2.6833059787750244, + "rewards/accuracy_reward/mean": 5.899537086486816, + "rewards/accuracy_reward/std": 3.0122287273406982, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 891.0, + "completions/max_terminated_length": 891.0, + "completions/mean_length": 501.9375, + "completions/mean_terminated_length": 501.9375, + "completions/min_length": 366.0, + "completions/min_terminated_length": 366.0, + "epoch": 0.2308157099697885, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.01700427196919918, + "learning_rate": 2.767588938836047e-06, + "loss": -0.0015, + "num_tokens": 64818043.0, + "reward": 4.204959869384766, + "reward_std": 0.7920815944671631, + "rewards/accuracy_reward/mean": 3.4588658809661865, + "rewards/accuracy_reward/std": 3.7785544395446777, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 382 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1438.0, + "completions/max_terminated_length": 1438.0, + "completions/mean_length": 644.640625, + "completions/mean_terminated_length": 644.640625, + "completions/min_length": 418.0, + "completions/min_terminated_length": 418.0, + "epoch": 0.23141993957703927, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03415811061859131, + "learning_rate": 2.766073278796447e-06, + "loss": 0.0108, + "num_tokens": 64968436.0, + "reward": 5.521308898925781, + "reward_std": 1.4820671081542969, + "rewards/accuracy_reward/mean": 4.771308898925781, + "rewards/accuracy_reward/std": 3.606571674346924, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 383 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 889.0, + "completions/max_terminated_length": 889.0, + "completions/mean_length": 578.140625, + "completions/mean_terminated_length": 578.140625, + "completions/min_length": 346.0, + "completions/min_terminated_length": 346.0, + "epoch": 0.23202416918429003, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04426858201622963, + "learning_rate": 2.7645531612991763e-06, + "loss": -0.0057, + "num_tokens": 65105037.0, + "reward": 3.6274635791778564, + "reward_std": 1.5602288246154785, + "rewards/accuracy_reward/mean": 2.8774635791778564, + "rewards/accuracy_reward/std": 3.6454968452453613, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1020.0, + "completions/max_terminated_length": 1020.0, + "completions/mean_length": 584.15625, + "completions/mean_terminated_length": 584.15625, + "completions/min_length": 290.0, + "completions/min_terminated_length": 290.0, + "epoch": 0.2326283987915408, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03901573643088341, + "learning_rate": 2.7630285924153943e-06, + "loss": -0.0057, + "num_tokens": 65324791.0, + "reward": 4.844162940979004, + "reward_std": 1.8299763202667236, + "rewards/accuracy_reward/mean": 4.094162940979004, + "rewards/accuracy_reward/std": 3.6519649028778076, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 925.0, + "completions/max_terminated_length": 925.0, + "completions/mean_length": 594.609375, + "completions/mean_terminated_length": 594.609375, + "completions/min_length": 282.0, + "completions/min_terminated_length": 282.0, + "epoch": 0.23323262839879155, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.027801280841231346, + "learning_rate": 2.7614995782340387e-06, + "loss": 0.0022, + "num_tokens": 65547070.0, + "reward": 5.608233451843262, + "reward_std": 0.9536074995994568, + "rewards/accuracy_reward/mean": 4.858234405517578, + "rewards/accuracy_reward/std": 3.5163233280181885, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1133.0, + "completions/max_terminated_length": 1133.0, + "completions/mean_length": 649.34375, + "completions/mean_terminated_length": 649.34375, + "completions/min_length": 346.0, + "completions/min_terminated_length": 346.0, + "epoch": 0.2338368580060423, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.02071204036474228, + "learning_rate": 2.7599661248618016e-06, + "loss": 0.0046, + "num_tokens": 65689620.0, + "reward": 2.3662657737731934, + "reward_std": 0.6995797753334045, + "rewards/accuracy_reward/mean": 1.616265892982483, + "rewards/accuracy_reward/std": 3.119234323501587, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 387 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 933.0, + "completions/max_terminated_length": 933.0, + "completions/mean_length": 575.828125, + "completions/mean_terminated_length": 575.828125, + "completions/min_length": 370.0, + "completions/min_terminated_length": 370.0, + "epoch": 0.23444108761329305, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03797253221273422, + "learning_rate": 2.758428238423106e-06, + "loss": -0.0161, + "num_tokens": 65838649.0, + "reward": 5.805190086364746, + "reward_std": 1.9327253103256226, + "rewards/accuracy_reward/mean": 5.055190086364746, + "rewards/accuracy_reward/std": 3.4521913528442383, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 920.0, + "completions/max_terminated_length": 920.0, + "completions/mean_length": 503.140625, + "completions/mean_terminated_length": 503.140625, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "epoch": 0.2350453172205438, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.02534298412501812, + "learning_rate": 2.756885925060078e-06, + "loss": -0.0218, + "num_tokens": 65997586.0, + "reward": 3.0558464527130127, + "reward_std": 1.0760294198989868, + "rewards/accuracy_reward/mean": 2.3058464527130127, + "rewards/accuracy_reward/std": 3.3734004497528076, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1081.0, + "completions/max_terminated_length": 1081.0, + "completions/mean_length": 646.59375, + "completions/mean_terminated_length": 646.59375, + "completions/min_length": 348.0, + "completions/min_terminated_length": 348.0, + "epoch": 0.23564954682779457, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04672691598534584, + "learning_rate": 2.7553391909325254e-06, + "loss": 0.0078, + "num_tokens": 66175992.0, + "reward": 4.101538181304932, + "reward_std": 1.9388151168823242, + "rewards/accuracy_reward/mean": 3.3515381813049316, + "rewards/accuracy_reward/std": 3.7361721992492676, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 725.0, + "completions/max_terminated_length": 725.0, + "completions/mean_length": 465.65625, + "completions/mean_terminated_length": 465.65625, + "completions/min_length": 280.0, + "completions/min_terminated_length": 280.0, + "epoch": 0.2362537764350453, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.014006305485963821, + "learning_rate": 2.7537880422179105e-06, + "loss": -0.0006, + "num_tokens": 66378514.0, + "reward": 2.5081138610839844, + "reward_std": 0.4355536103248596, + "rewards/accuracy_reward/mean": 1.7581140995025635, + "rewards/accuracy_reward/std": 3.1847729682922363, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 391 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 798.0, + "completions/max_terminated_length": 798.0, + "completions/mean_length": 495.265625, + "completions/mean_terminated_length": 495.265625, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 0.23685800604229607, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03343105688691139, + "learning_rate": 2.7522324851113294e-06, + "loss": 0.013, + "num_tokens": 66552275.0, + "reward": 7.446891784667969, + "reward_std": 1.8701642751693726, + "rewards/accuracy_reward/mean": 6.700798034667969, + "rewards/accuracy_reward/std": 2.1746902465820312, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 761.0, + "completions/max_terminated_length": 761.0, + "completions/mean_length": 526.296875, + "completions/mean_terminated_length": 526.296875, + "completions/min_length": 364.0, + "completions/min_terminated_length": 364.0, + "epoch": 0.23746223564954683, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.038363322615623474, + "learning_rate": 2.7506725258254835e-06, + "loss": -0.0103, + "num_tokens": 66692134.0, + "reward": 6.325927734375, + "reward_std": 2.042163133621216, + "rewards/accuracy_reward/mean": 5.575927257537842, + "rewards/accuracy_reward/std": 3.267608404159546, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1100.0, + "completions/max_terminated_length": 1100.0, + "completions/mean_length": 595.515625, + "completions/mean_terminated_length": 595.515625, + "completions/min_length": 328.0, + "completions/min_terminated_length": 328.0, + "epoch": 0.2380664652567976, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04012203961610794, + "learning_rate": 2.749108170590655e-06, + "loss": 0.0225, + "num_tokens": 66819319.0, + "reward": 3.8521828651428223, + "reward_std": 1.5667014122009277, + "rewards/accuracy_reward/mean": 3.102182388305664, + "rewards/accuracy_reward/std": 3.8125481605529785, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 837.0, + "completions/max_terminated_length": 837.0, + "completions/mean_length": 526.140625, + "completions/mean_terminated_length": 526.140625, + "completions/min_length": 338.0, + "completions/min_terminated_length": 338.0, + "epoch": 0.23867069486404835, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.02602400816977024, + "learning_rate": 2.7475394256546846e-06, + "loss": 0.0098, + "num_tokens": 66957456.0, + "reward": 1.0799484252929688, + "reward_std": 0.8054669499397278, + "rewards/accuracy_reward/mean": 0.32994842529296875, + "rewards/accuracy_reward/std": 1.5790510177612305, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1038.0, + "completions/max_terminated_length": 1038.0, + "completions/mean_length": 533.078125, + "completions/mean_terminated_length": 533.078125, + "completions/min_length": 392.0, + "completions/min_terminated_length": 392.0, + "epoch": 0.23927492447129908, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03723419830203056, + "learning_rate": 2.745966297282944e-06, + "loss": 0.0006, + "num_tokens": 67113989.0, + "reward": 3.3259968757629395, + "reward_std": 1.0555139780044556, + "rewards/accuracy_reward/mean": 2.5759968757629395, + "rewards/accuracy_reward/std": 3.5104329586029053, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 757.0, + "completions/max_terminated_length": 757.0, + "completions/mean_length": 531.515625, + "completions/mean_terminated_length": 531.515625, + "completions/min_length": 271.0, + "completions/min_terminated_length": 271.0, + "epoch": 0.23987915407854984, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05018901452422142, + "learning_rate": 2.744388791758311e-06, + "loss": 0.0524, + "num_tokens": 67276966.0, + "reward": 2.6442418098449707, + "reward_std": 2.539860963821411, + "rewards/accuracy_reward/mean": 1.8942415714263916, + "rewards/accuracy_reward/std": 3.093477249145508, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 397 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1227.0, + "completions/max_terminated_length": 1227.0, + "completions/mean_length": 617.4375, + "completions/mean_terminated_length": 617.4375, + "completions/min_length": 323.0, + "completions/min_terminated_length": 323.0, + "epoch": 0.2404833836858006, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.05526130646467209, + "learning_rate": 2.7428069153811483e-06, + "loss": 0.0126, + "num_tokens": 67407906.0, + "reward": 2.635937452316284, + "reward_std": 2.2716708183288574, + "rewards/accuracy_reward/mean": 1.8859374523162842, + "rewards/accuracy_reward/std": 3.3286306858062744, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 398 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 966.0, + "completions/max_terminated_length": 966.0, + "completions/mean_length": 571.65625, + "completions/mean_terminated_length": 571.65625, + "completions/min_length": 364.0, + "completions/min_terminated_length": 364.0, + "epoch": 0.24108761329305137, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.02349836193025112, + "learning_rate": 2.741220674469271e-06, + "loss": 0.0055, + "num_tokens": 67542732.0, + "reward": 4.766990661621094, + "reward_std": 0.7732627987861633, + "rewards/accuracy_reward/mean": 4.016990661621094, + "rewards/accuracy_reward/std": 3.7194714546203613, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 966.0, + "completions/max_terminated_length": 966.0, + "completions/mean_length": 623.53125, + "completions/mean_terminated_length": 623.53125, + "completions/min_length": 409.0, + "completions/min_terminated_length": 409.0, + "epoch": 0.24169184290030213, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.036610525101423264, + "learning_rate": 2.739630075357929e-06, + "loss": -0.0095, + "num_tokens": 67715022.0, + "reward": 5.983050346374512, + "reward_std": 1.3320873975753784, + "rewards/accuracy_reward/mean": 5.233050346374512, + "rewards/accuracy_reward/std": 3.4640607833862305, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 802.0, + "completions/max_terminated_length": 802.0, + "completions/mean_length": 578.109375, + "completions/mean_terminated_length": 578.109375, + "completions/min_length": 434.0, + "completions/min_terminated_length": 434.0, + "epoch": 0.24229607250755286, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03501831740140915, + "learning_rate": 2.7380351243997765e-06, + "loss": 0.0185, + "num_tokens": 67872613.0, + "reward": 7.494234561920166, + "reward_std": 1.3766331672668457, + "rewards/accuracy_reward/mean": 6.744234561920166, + "rewards/accuracy_reward/std": 2.186640977859497, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 401 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1053.0, + "completions/max_terminated_length": 1053.0, + "completions/mean_length": 565.296875, + "completions/mean_terminated_length": 565.296875, + "completions/min_length": 367.0, + "completions/min_terminated_length": 367.0, + "epoch": 0.24290030211480362, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.030770402401685715, + "learning_rate": 2.7364358279648495e-06, + "loss": -0.0093, + "num_tokens": 68004472.0, + "reward": 5.83440637588501, + "reward_std": 0.9546550512313843, + "rewards/accuracy_reward/mean": 5.088313102722168, + "rewards/accuracy_reward/std": 3.516470193862915, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 402 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 927.0, + "completions/max_terminated_length": 927.0, + "completions/mean_length": 565.59375, + "completions/mean_terminated_length": 565.59375, + "completions/min_length": 362.0, + "completions/min_terminated_length": 362.0, + "epoch": 0.24350453172205438, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.023572171106934547, + "learning_rate": 2.7348321924405384e-06, + "loss": -0.006, + "num_tokens": 68165758.0, + "reward": 2.738420248031616, + "reward_std": 0.9742423892021179, + "rewards/accuracy_reward/mean": 1.9884201288223267, + "rewards/accuracy_reward/std": 3.181776285171509, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 403 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 754.0, + "completions/max_terminated_length": 754.0, + "completions/mean_length": 450.328125, + "completions/mean_terminated_length": 450.328125, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "epoch": 0.24410876132930515, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03960014879703522, + "learning_rate": 2.7332242242315637e-06, + "loss": 0.0584, + "num_tokens": 68330723.0, + "reward": 7.031972885131836, + "reward_std": 1.710128664970398, + "rewards/accuracy_reward/mean": 6.281972885131836, + "rewards/accuracy_reward/std": 2.549539804458618, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1350.0, + "completions/max_terminated_length": 1350.0, + "completions/mean_length": 596.390625, + "completions/mean_terminated_length": 596.390625, + "completions/min_length": 380.0, + "completions/min_terminated_length": 380.0, + "epoch": 0.24471299093655588, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.05281181260943413, + "learning_rate": 2.7316119297599505e-06, + "loss": 0.0024, + "num_tokens": 68455500.0, + "reward": 3.6700515747070312, + "reward_std": 2.423677921295166, + "rewards/accuracy_reward/mean": 2.9200515747070312, + "rewards/accuracy_reward/std": 3.6464459896087646, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 645.0, + "completions/max_terminated_length": 645.0, + "completions/mean_length": 421.28125, + "completions/mean_terminated_length": 421.28125, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "epoch": 0.24531722054380664, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.017943305894732475, + "learning_rate": 2.7299953154650018e-06, + "loss": -0.0046, + "num_tokens": 68594350.0, + "reward": 2.8398282527923584, + "reward_std": 0.6458674073219299, + "rewards/accuracy_reward/mean": 2.0898284912109375, + "rewards/accuracy_reward/std": 3.3672804832458496, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 406 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1407.0, + "completions/max_terminated_length": 1407.0, + "completions/mean_length": 591.921875, + "completions/mean_terminated_length": 591.921875, + "completions/min_length": 299.0, + "completions/min_terminated_length": 299.0, + "epoch": 0.2459214501510574, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04836684465408325, + "learning_rate": 2.7283743878032735e-06, + "loss": -0.0219, + "num_tokens": 68757433.0, + "reward": 4.362435817718506, + "reward_std": 1.9642586708068848, + "rewards/accuracy_reward/mean": 3.616342067718506, + "rewards/accuracy_reward/std": 3.8115344047546387, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 407 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1280.0, + "completions/max_terminated_length": 1280.0, + "completions/mean_length": 551.5625, + "completions/mean_terminated_length": 551.5625, + "completions/min_length": 326.0, + "completions/min_terminated_length": 326.0, + "epoch": 0.24652567975830816, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03729773312807083, + "learning_rate": 2.726749153248549e-06, + "loss": 0.0333, + "num_tokens": 68920861.0, + "reward": 5.092641353607178, + "reward_std": 1.3783550262451172, + "rewards/accuracy_reward/mean": 4.342641353607178, + "rewards/accuracy_reward/std": 3.640113353729248, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 813.0, + "completions/max_terminated_length": 813.0, + "completions/mean_length": 595.890625, + "completions/mean_terminated_length": 595.890625, + "completions/min_length": 373.0, + "completions/min_terminated_length": 373.0, + "epoch": 0.24712990936555893, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03614754229784012, + "learning_rate": 2.7251196182918136e-06, + "loss": -0.0075, + "num_tokens": 69081558.0, + "reward": 6.686637878417969, + "reward_std": 1.2982515096664429, + "rewards/accuracy_reward/mean": 5.936637878417969, + "rewards/accuracy_reward/std": 3.003369092941284, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1431.0, + "completions/max_terminated_length": 1431.0, + "completions/mean_length": 556.84375, + "completions/mean_terminated_length": 556.84375, + "completions/min_length": 271.0, + "completions/min_terminated_length": 271.0, + "epoch": 0.24773413897280966, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.047166772186756134, + "learning_rate": 2.7234857894412257e-06, + "loss": 0.032, + "num_tokens": 69241740.0, + "reward": 4.556282043457031, + "reward_std": 1.8165283203125, + "rewards/accuracy_reward/mean": 3.8101887702941895, + "rewards/accuracy_reward/std": 3.670964241027832, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1499.0, + "completions/mean_length": 673.75, + "completions/mean_terminated_length": 651.9365234375, + "completions/min_length": 379.0, + "completions/min_terminated_length": 379.0, + "epoch": 0.24833836858006042, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.027966484427452087, + "learning_rate": 2.7218476732220945e-06, + "loss": 0.0364, + "num_tokens": 69399004.0, + "reward": 4.836148262023926, + "reward_std": 0.8314657211303711, + "rewards/accuracy_reward/mean": 4.090054512023926, + "rewards/accuracy_reward/std": 3.717320203781128, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 411 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1483.0, + "completions/max_terminated_length": 1483.0, + "completions/mean_length": 620.875, + "completions/mean_terminated_length": 620.875, + "completions/min_length": 386.0, + "completions/min_terminated_length": 386.0, + "epoch": 0.24894259818731118, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.027016334235668182, + "learning_rate": 2.720205276176853e-06, + "loss": 0.0104, + "num_tokens": 69620756.0, + "reward": 1.6601476669311523, + "reward_std": 1.0675016641616821, + "rewards/accuracy_reward/mean": 0.9140540361404419, + "rewards/accuracy_reward/std": 2.3147552013397217, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 842.0, + "completions/max_terminated_length": 842.0, + "completions/mean_length": 582.5, + "completions/mean_terminated_length": 582.5, + "completions/min_length": 404.0, + "completions/min_terminated_length": 404.0, + "epoch": 0.24954682779456194, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03903573751449585, + "learning_rate": 2.7185586048650297e-06, + "loss": -0.0176, + "num_tokens": 69781652.0, + "reward": 1.6622406244277954, + "reward_std": 1.7167623043060303, + "rewards/accuracy_reward/mean": 0.9161468744277954, + "rewards/accuracy_reward/std": 2.5021677017211914, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 413 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 670.0, + "completions/max_terminated_length": 670.0, + "completions/mean_length": 511.71875, + "completions/mean_terminated_length": 511.71875, + "completions/min_length": 350.0, + "completions/min_terminated_length": 350.0, + "epoch": 0.2501510574018127, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03137747570872307, + "learning_rate": 2.7169076658632243e-06, + "loss": -0.0155, + "num_tokens": 69916610.0, + "reward": 2.9026904106140137, + "reward_std": 1.6683272123336792, + "rewards/accuracy_reward/mean": 2.1526906490325928, + "rewards/accuracy_reward/std": 3.4690475463867188, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 414 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1234.0, + "completions/max_terminated_length": 1234.0, + "completions/mean_length": 621.71875, + "completions/mean_terminated_length": 621.71875, + "completions/min_length": 276.0, + "completions/min_terminated_length": 276.0, + "epoch": 0.25075528700906347, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0468265525996685, + "learning_rate": 2.7152524657650824e-06, + "loss": 0.0186, + "num_tokens": 70095648.0, + "reward": 2.42647647857666, + "reward_std": 1.8191297054290771, + "rewards/accuracy_reward/mean": 1.6764764785766602, + "rewards/accuracy_reward/std": 3.043949604034424, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1078.0, + "completions/max_terminated_length": 1078.0, + "completions/mean_length": 620.546875, + "completions/mean_terminated_length": 620.546875, + "completions/min_length": 323.0, + "completions/min_terminated_length": 323.0, + "epoch": 0.2513595166163142, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05642234534025192, + "learning_rate": 2.713593011181267e-06, + "loss": 0.0402, + "num_tokens": 70235443.0, + "reward": 4.78842830657959, + "reward_std": 2.6079258918762207, + "rewards/accuracy_reward/mean": 4.03842830657959, + "rewards/accuracy_reward/std": 3.747454881668091, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 808.0, + "completions/max_terminated_length": 808.0, + "completions/mean_length": 524.046875, + "completions/mean_terminated_length": 524.046875, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "epoch": 0.25196374622356493, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.036830365657806396, + "learning_rate": 2.7119293087394325e-06, + "loss": 0.0132, + "num_tokens": 70415446.0, + "reward": 6.516267776489258, + "reward_std": 2.168165683746338, + "rewards/accuracy_reward/mean": 5.7701735496521, + "rewards/accuracy_reward/std": 3.118441343307495, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 417 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 714.0, + "completions/max_terminated_length": 714.0, + "completions/mean_length": 513.9375, + "completions/mean_terminated_length": 513.9375, + "completions/min_length": 398.0, + "completions/min_terminated_length": 398.0, + "epoch": 0.2525679758308157, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.019437026232481003, + "learning_rate": 2.7102613650841994e-06, + "loss": -0.0011, + "num_tokens": 70570002.0, + "reward": 4.311394214630127, + "reward_std": 0.5279185175895691, + "rewards/accuracy_reward/mean": 3.561394453048706, + "rewards/accuracy_reward/std": 3.7374603748321533, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1100.0, + "completions/mean_length": 612.21875, + "completions/mean_terminated_length": 541.6065063476562, + "completions/min_length": 290.0, + "completions/min_terminated_length": 290.0, + "epoch": 0.25317220543806646, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.039134763181209564, + "learning_rate": 2.7085891868771273e-06, + "loss": -0.0201, + "num_tokens": 70763344.0, + "reward": 1.9860488176345825, + "reward_std": 1.6701476573944092, + "rewards/accuracy_reward/mean": 1.271205186843872, + "rewards/accuracy_reward/std": 2.82194447517395, + "rewards/tag_count_reward/mean": 0.71484375, + "rewards/tag_count_reward/std": 0.1597815304994583, + "step": 419 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 820.0, + "completions/mean_length": 630.578125, + "completions/mean_terminated_length": 608.0794067382812, + "completions/min_length": 338.0, + "completions/min_terminated_length": 338.0, + "epoch": 0.2537764350453172, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.05450756475329399, + "learning_rate": 2.706912780796687e-06, + "loss": -0.0422, + "num_tokens": 70893429.0, + "reward": 3.9140610694885254, + "reward_std": 2.7685046195983887, + "rewards/accuracy_reward/mean": 3.1757798194885254, + "rewards/accuracy_reward/std": 3.6830694675445557, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 849.0, + "completions/mean_length": 584.734375, + "completions/mean_terminated_length": 561.5079956054688, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, + "epoch": 0.254380664652568, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.029588233679533005, + "learning_rate": 2.7052321535382365e-06, + "loss": -0.0015, + "num_tokens": 71051284.0, + "reward": 2.922788619995117, + "reward_std": 1.5015074014663696, + "rewards/accuracy_reward/mean": 2.184507369995117, + "rewards/accuracy_reward/std": 3.4755642414093018, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 421 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1206.0, + "completions/max_terminated_length": 1206.0, + "completions/mean_length": 546.71875, + "completions/mean_terminated_length": 546.71875, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "epoch": 0.25498489425981874, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04892963543534279, + "learning_rate": 2.7035473118139913e-06, + "loss": 0.0367, + "num_tokens": 71197634.0, + "reward": 2.3423681259155273, + "reward_std": 2.473632335662842, + "rewards/accuracy_reward/mean": 1.5923681259155273, + "rewards/accuracy_reward/std": 3.0989723205566406, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 422 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 889.0, + "completions/max_terminated_length": 889.0, + "completions/mean_length": 541.671875, + "completions/mean_terminated_length": 541.671875, + "completions/min_length": 286.0, + "completions/min_terminated_length": 286.0, + "epoch": 0.2555891238670695, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.050377216190099716, + "learning_rate": 2.701858262352999e-06, + "loss": -0.0083, + "num_tokens": 71350381.0, + "reward": 4.551070690155029, + "reward_std": 1.8418408632278442, + "rewards/accuracy_reward/mean": 3.8010706901550293, + "rewards/accuracy_reward/std": 3.813474178314209, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1484.0, + "completions/max_terminated_length": 1484.0, + "completions/mean_length": 679.203125, + "completions/mean_terminated_length": 679.203125, + "completions/min_length": 335.0, + "completions/min_terminated_length": 335.0, + "epoch": 0.25619335347432026, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03350657969713211, + "learning_rate": 2.7001650119011137e-06, + "loss": 0.0082, + "num_tokens": 71515514.0, + "reward": 3.1820414066314697, + "reward_std": 1.3149927854537964, + "rewards/accuracy_reward/mean": 2.4320414066314697, + "rewards/accuracy_reward/std": 3.507869243621826, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1116.0, + "completions/max_terminated_length": 1116.0, + "completions/mean_length": 618.53125, + "completions/mean_terminated_length": 618.53125, + "completions/min_length": 329.0, + "completions/min_terminated_length": 329.0, + "epoch": 0.256797583081571, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0408954992890358, + "learning_rate": 2.6984675672209658e-06, + "loss": 0.0197, + "num_tokens": 71679452.0, + "reward": 4.087068557739258, + "reward_std": 1.4120824337005615, + "rewards/accuracy_reward/mean": 3.337068557739258, + "rewards/accuracy_reward/std": 3.64811110496521, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1017.0, + "completions/max_terminated_length": 1017.0, + "completions/mean_length": 545.40625, + "completions/mean_terminated_length": 545.40625, + "completions/min_length": 378.0, + "completions/min_terminated_length": 378.0, + "epoch": 0.25740181268882173, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.026873044669628143, + "learning_rate": 2.6967659350919386e-06, + "loss": 0.0015, + "num_tokens": 71861542.0, + "reward": 2.9322402477264404, + "reward_std": 1.3837058544158936, + "rewards/accuracy_reward/mean": 2.1822402477264404, + "rewards/accuracy_reward/std": 3.458317756652832, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 426 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 983.0, + "completions/max_terminated_length": 983.0, + "completions/mean_length": 624.171875, + "completions/mean_terminated_length": 624.171875, + "completions/min_length": 385.0, + "completions/min_terminated_length": 385.0, + "epoch": 0.2580060422960725, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.003909233491867781, + "learning_rate": 2.6950601223101384e-06, + "loss": -0.0012, + "num_tokens": 72046145.0, + "reward": 0.7919703125953674, + "reward_std": 0.1537153124809265, + "rewards/accuracy_reward/mean": 0.04978281259536743, + "rewards/accuracy_reward/std": 0.2108035683631897, + "rewards/tag_count_reward/mean": 0.7421875, + "rewards/tag_count_reward/std": 0.0625, + "step": 427 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 990.0, + "completions/mean_length": 649.875, + "completions/mean_terminated_length": 627.6825561523438, + "completions/min_length": 395.0, + "completions/min_terminated_length": 395.0, + "epoch": 0.25861027190332325, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.037263333797454834, + "learning_rate": 2.6933501356883697e-06, + "loss": -0.0125, + "num_tokens": 72204713.0, + "reward": 5.252632141113281, + "reward_std": 1.4149038791656494, + "rewards/accuracy_reward/mean": 4.514350891113281, + "rewards/accuracy_reward/std": 3.6776068210601807, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1432.0, + "completions/mean_length": 715.984375, + "completions/mean_terminated_length": 694.84130859375, + "completions/min_length": 378.0, + "completions/min_terminated_length": 378.0, + "epoch": 0.259214501510574, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05011865496635437, + "learning_rate": 2.6916359820561054e-06, + "loss": 0.0012, + "num_tokens": 72369160.0, + "reward": 3.513594627380371, + "reward_std": 2.3615574836730957, + "rewards/accuracy_reward/mean": 2.77531361579895, + "rewards/accuracy_reward/std": 3.5319957733154297, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 429 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 945.0, + "completions/max_terminated_length": 945.0, + "completions/mean_length": 507.015625, + "completions/mean_terminated_length": 507.015625, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "epoch": 0.2598187311178248, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.048424169421195984, + "learning_rate": 2.689917668259462e-06, + "loss": -0.0494, + "num_tokens": 72543657.0, + "reward": 4.052964210510254, + "reward_std": 2.6424996852874756, + "rewards/accuracy_reward/mean": 3.314683198928833, + "rewards/accuracy_reward/std": 3.7579541206359863, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1045.0, + "completions/max_terminated_length": 1045.0, + "completions/mean_length": 615.265625, + "completions/mean_terminated_length": 615.265625, + "completions/min_length": 344.0, + "completions/min_terminated_length": 344.0, + "epoch": 0.26042296072507554, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.035390015691518784, + "learning_rate": 2.688195201161171e-06, + "loss": -0.0085, + "num_tokens": 72697594.0, + "reward": 5.206467151641846, + "reward_std": 1.995316743850708, + "rewards/accuracy_reward/mean": 4.456467151641846, + "rewards/accuracy_reward/std": 3.6553211212158203, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 431 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1927.0, + "completions/max_terminated_length": 1927.0, + "completions/mean_length": 758.078125, + "completions/mean_terminated_length": 758.078125, + "completions/min_length": 372.0, + "completions/min_terminated_length": 372.0, + "epoch": 0.2610271903323263, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03312918543815613, + "learning_rate": 2.686468587640551e-06, + "loss": 0.0136, + "num_tokens": 72885327.0, + "reward": 3.9320766925811768, + "reward_std": 0.9714311361312866, + "rewards/accuracy_reward/mean": 3.185983180999756, + "rewards/accuracy_reward/std": 3.742751359939575, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1438.0, + "completions/max_terminated_length": 1438.0, + "completions/mean_length": 622.6875, + "completions/mean_terminated_length": 622.6875, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, + "epoch": 0.26163141993957706, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004088149406015873, + "learning_rate": 2.6847378345934814e-06, + "loss": -0.0004, + "num_tokens": 73136395.0, + "reward": 4.364225387573242, + "reward_std": 0.13663916289806366, + "rewards/accuracy_reward/mean": 3.618131637573242, + "rewards/accuracy_reward/std": 3.715705156326294, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 433 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1626.0, + "completions/max_terminated_length": 1626.0, + "completions/mean_length": 629.359375, + "completions/mean_terminated_length": 629.359375, + "completions/min_length": 336.0, + "completions/min_terminated_length": 336.0, + "epoch": 0.2622356495468278, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.041975073516368866, + "learning_rate": 2.683002948932374e-06, + "loss": 0.0224, + "num_tokens": 73306114.0, + "reward": 2.583590507507324, + "reward_std": 1.8978196382522583, + "rewards/accuracy_reward/mean": 1.8335906267166138, + "rewards/accuracy_reward/std": 3.278383493423462, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 770.0, + "completions/max_terminated_length": 770.0, + "completions/mean_length": 540.125, + "completions/mean_terminated_length": 540.125, + "completions/min_length": 360.0, + "completions/min_terminated_length": 360.0, + "epoch": 0.2628398791540785, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.044430095702409744, + "learning_rate": 2.6812639375861472e-06, + "loss": 0.0006, + "num_tokens": 73458810.0, + "reward": 6.8560404777526855, + "reward_std": 1.6851288080215454, + "rewards/accuracy_reward/mean": 6.1060404777526855, + "rewards/accuracy_reward/std": 2.8922669887542725, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 802.0, + "completions/max_terminated_length": 802.0, + "completions/mean_length": 527.8125, + "completions/mean_terminated_length": 527.8125, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, + "epoch": 0.2634441087613293, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05103413760662079, + "learning_rate": 2.679520807500195e-06, + "loss": -0.001, + "num_tokens": 73599838.0, + "reward": 6.898777008056641, + "reward_std": 2.230992555618286, + "rewards/accuracy_reward/mean": 6.148777008056641, + "rewards/accuracy_reward/std": 2.8680543899536133, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 743.0, + "completions/max_terminated_length": 743.0, + "completions/mean_length": 500.828125, + "completions/mean_terminated_length": 500.828125, + "completions/min_length": 305.0, + "completions/min_terminated_length": 305.0, + "epoch": 0.26404833836858005, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03114660270512104, + "learning_rate": 2.6777735656363616e-06, + "loss": -0.0005, + "num_tokens": 73752387.0, + "reward": 4.199892997741699, + "reward_std": 1.5443193912506104, + "rewards/accuracy_reward/mean": 3.449892520904541, + "rewards/accuracy_reward/std": 3.7019190788269043, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1253.0, + "completions/max_terminated_length": 1253.0, + "completions/mean_length": 579.578125, + "completions/mean_terminated_length": 579.578125, + "completions/min_length": 302.0, + "completions/min_terminated_length": 302.0, + "epoch": 0.2646525679758308, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.000577275815885514, + "learning_rate": 2.6760222189729137e-06, + "loss": -0.0005, + "num_tokens": 73923368.0, + "reward": 6.320572853088379, + "reward_std": 0.03227253258228302, + "rewards/accuracy_reward/mean": 5.570572853088379, + "rewards/accuracy_reward/std": 3.241827964782715, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 820.0, + "completions/max_terminated_length": 820.0, + "completions/mean_length": 591.96875, + "completions/mean_terminated_length": 591.96875, + "completions/min_length": 418.0, + "completions/min_terminated_length": 418.0, + "epoch": 0.26525679758308157, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04033992439508438, + "learning_rate": 2.6742667745045114e-06, + "loss": 0.0071, + "num_tokens": 74095574.0, + "reward": 5.141820907592773, + "reward_std": 2.3740718364715576, + "rewards/accuracy_reward/mean": 4.391820907592773, + "rewards/accuracy_reward/std": 3.5103580951690674, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 439 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 998.0, + "completions/max_terminated_length": 998.0, + "completions/mean_length": 574.1875, + "completions/mean_terminated_length": 574.1875, + "completions/min_length": 371.0, + "completions/min_terminated_length": 371.0, + "epoch": 0.26586102719033233, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03862234577536583, + "learning_rate": 2.672507239242182e-06, + "loss": 0.0188, + "num_tokens": 74228962.0, + "reward": 3.123460054397583, + "reward_std": 1.5212461948394775, + "rewards/accuracy_reward/mean": 2.373460292816162, + "rewards/accuracy_reward/std": 3.514248847961426, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1927.0, + "completions/max_terminated_length": 1927.0, + "completions/mean_length": 832.0, + "completions/mean_terminated_length": 832.0, + "completions/min_length": 453.0, + "completions/min_terminated_length": 453.0, + "epoch": 0.2664652567975831, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.024363670498132706, + "learning_rate": 2.6707436202132896e-06, + "loss": -0.0032, + "num_tokens": 74402898.0, + "reward": 4.13701868057251, + "reward_std": 1.1002719402313232, + "rewards/accuracy_reward/mean": 3.390925168991089, + "rewards/accuracy_reward/std": 3.726591110229492, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 441 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1065.0, + "completions/max_terminated_length": 1065.0, + "completions/mean_length": 557.53125, + "completions/mean_terminated_length": 557.53125, + "completions/min_length": 363.0, + "completions/min_terminated_length": 363.0, + "epoch": 0.26706948640483386, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04967978596687317, + "learning_rate": 2.6689759244615094e-06, + "loss": -0.0046, + "num_tokens": 74561476.0, + "reward": 3.730961799621582, + "reward_std": 1.7251052856445312, + "rewards/accuracy_reward/mean": 2.980961799621582, + "rewards/accuracy_reward/std": 3.7007219791412354, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 442 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1020.0, + "completions/max_terminated_length": 1020.0, + "completions/mean_length": 607.890625, + "completions/mean_terminated_length": 607.890625, + "completions/min_length": 405.0, + "completions/min_terminated_length": 405.0, + "epoch": 0.2676737160120846, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03361257538199425, + "learning_rate": 2.667204159046797e-06, + "loss": 0.0088, + "num_tokens": 74715245.0, + "reward": 3.8790557384490967, + "reward_std": 0.9656737446784973, + "rewards/accuracy_reward/mean": 3.132962226867676, + "rewards/accuracy_reward/std": 3.7164242267608643, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 443 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1408.0, + "completions/max_terminated_length": 1408.0, + "completions/mean_length": 634.1875, + "completions/mean_terminated_length": 634.1875, + "completions/min_length": 379.0, + "completions/min_terminated_length": 379.0, + "epoch": 0.2682779456193353, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.018171699717640877, + "learning_rate": 2.6654283310453644e-06, + "loss": 0.0029, + "num_tokens": 74884937.0, + "reward": 6.322062015533447, + "reward_std": 0.5506852865219116, + "rewards/accuracy_reward/mean": 5.572061538696289, + "rewards/accuracy_reward/std": 3.151312828063965, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1000.0, + "completions/max_terminated_length": 1000.0, + "completions/mean_length": 606.875, + "completions/mean_terminated_length": 606.875, + "completions/min_length": 358.0, + "completions/min_terminated_length": 358.0, + "epoch": 0.2688821752265861, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.038995012640953064, + "learning_rate": 2.663648447549646e-06, + "loss": 0.0, + "num_tokens": 75050449.0, + "reward": 3.494199752807617, + "reward_std": 1.5041486024856567, + "rewards/accuracy_reward/mean": 2.744199514389038, + "rewards/accuracy_reward/std": 3.561009645462036, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 726.0, + "completions/max_terminated_length": 726.0, + "completions/mean_length": 514.65625, + "completions/mean_terminated_length": 514.65625, + "completions/min_length": 307.0, + "completions/min_terminated_length": 307.0, + "epoch": 0.26948640483383685, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05326542630791664, + "learning_rate": 2.661864515668276e-06, + "loss": -0.0106, + "num_tokens": 75249371.0, + "reward": 4.441043853759766, + "reward_std": 2.9623594284057617, + "rewards/accuracy_reward/mean": 3.6949501037597656, + "rewards/accuracy_reward/std": 3.7139551639556885, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 849.0, + "completions/max_terminated_length": 849.0, + "completions/mean_length": 563.78125, + "completions/mean_terminated_length": 563.78125, + "completions/min_length": 382.0, + "completions/min_terminated_length": 382.0, + "epoch": 0.2700906344410876, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.030433528125286102, + "learning_rate": 2.6600765425260557e-06, + "loss": -0.0291, + "num_tokens": 75395869.0, + "reward": 3.6552515029907227, + "reward_std": 0.9599775671958923, + "rewards/accuracy_reward/mean": 2.9052515029907227, + "rewards/accuracy_reward/std": 3.6573984622955322, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1052.0, + "completions/max_terminated_length": 1052.0, + "completions/mean_length": 538.953125, + "completions/mean_terminated_length": 538.953125, + "completions/min_length": 315.0, + "completions/min_terminated_length": 315.0, + "epoch": 0.27069486404833837, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.05743812397122383, + "learning_rate": 2.6582845352639265e-06, + "loss": 0.0351, + "num_tokens": 75534602.0, + "reward": 4.2267656326293945, + "reward_std": 2.533564567565918, + "rewards/accuracy_reward/mean": 3.4767656326293945, + "rewards/accuracy_reward/std": 3.7052271366119385, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 738.0, + "completions/max_terminated_length": 738.0, + "completions/mean_length": 485.265625, + "completions/mean_terminated_length": 485.265625, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "epoch": 0.27129909365558913, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03951618820428848, + "learning_rate": 2.6564885010389428e-06, + "loss": 0.0049, + "num_tokens": 75767867.0, + "reward": 5.046546459197998, + "reward_std": 1.8471548557281494, + "rewards/accuracy_reward/mean": 4.296546936035156, + "rewards/accuracy_reward/std": 3.574413537979126, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 449 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1060.0, + "completions/max_terminated_length": 1060.0, + "completions/mean_length": 592.5625, + "completions/mean_terminated_length": 592.5625, + "completions/min_length": 275.0, + "completions/min_terminated_length": 275.0, + "epoch": 0.2719033232628399, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.024829935282468796, + "learning_rate": 2.6546884470242415e-06, + "loss": 0.0072, + "num_tokens": 75934719.0, + "reward": 4.193317413330078, + "reward_std": 1.1854702234268188, + "rewards/accuracy_reward/mean": 3.4433178901672363, + "rewards/accuracy_reward/std": 3.691708564758301, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 938.0, + "completions/max_terminated_length": 938.0, + "completions/mean_length": 616.796875, + "completions/mean_terminated_length": 616.796875, + "completions/min_length": 322.0, + "completions/min_terminated_length": 322.0, + "epoch": 0.27250755287009065, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.046247001737356186, + "learning_rate": 2.6528843804090146e-06, + "loss": -0.0141, + "num_tokens": 76062130.0, + "reward": 4.181512355804443, + "reward_std": 1.9258592128753662, + "rewards/accuracy_reward/mean": 3.4315125942230225, + "rewards/accuracy_reward/std": 3.7440712451934814, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 451 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 992.0, + "completions/max_terminated_length": 992.0, + "completions/mean_length": 546.609375, + "completions/mean_terminated_length": 546.609375, + "completions/min_length": 367.0, + "completions/min_terminated_length": 367.0, + "epoch": 0.2731117824773414, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0019276558887213469, + "learning_rate": 2.651076308398479e-06, + "loss": 0.001, + "num_tokens": 76211529.0, + "reward": 2.6945700645446777, + "reward_std": 0.06256815791130066, + "rewards/accuracy_reward/mean": 1.9484763145446777, + "rewards/accuracy_reward/std": 3.2034049034118652, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 824.0, + "completions/max_terminated_length": 824.0, + "completions/mean_length": 594.21875, + "completions/mean_terminated_length": 594.21875, + "completions/min_length": 386.0, + "completions/min_terminated_length": 386.0, + "epoch": 0.2737160120845921, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.010631216689944267, + "learning_rate": 2.649264238213849e-06, + "loss": 0.0044, + "num_tokens": 76364343.0, + "reward": 0.8651062250137329, + "reward_std": 0.5262972712516785, + "rewards/accuracy_reward/mean": 0.1151062399148941, + "rewards/accuracy_reward/std": 0.9440973997116089, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 453 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 853.0, + "completions/max_terminated_length": 853.0, + "completions/mean_length": 534.0, + "completions/mean_terminated_length": 534.0, + "completions/min_length": 353.0, + "completions/min_terminated_length": 353.0, + "epoch": 0.2743202416918429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04203030467033386, + "learning_rate": 2.6474481770923075e-06, + "loss": -0.0064, + "num_tokens": 76531319.0, + "reward": 5.61978006362915, + "reward_std": 2.2349166870117188, + "rewards/accuracy_reward/mean": 4.86978006362915, + "rewards/accuracy_reward/std": 3.5877480506896973, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 454 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 918.0, + "completions/max_terminated_length": 918.0, + "completions/mean_length": 557.375, + "completions/mean_terminated_length": 557.375, + "completions/min_length": 347.0, + "completions/min_terminated_length": 347.0, + "epoch": 0.27492447129909364, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.026796918362379074, + "learning_rate": 2.6456281322869766e-06, + "loss": -0.0099, + "num_tokens": 76689439.0, + "reward": 3.9718410968780518, + "reward_std": 0.8408603668212891, + "rewards/accuracy_reward/mean": 3.2218410968780518, + "rewards/accuracy_reward/std": 3.6825292110443115, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1961.0, + "completions/mean_length": 796.234375, + "completions/mean_terminated_length": 776.3651123046875, + "completions/min_length": 442.0, + "completions/min_terminated_length": 442.0, + "epoch": 0.2755287009063444, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03629680350422859, + "learning_rate": 2.643804111066888e-06, + "loss": -0.0225, + "num_tokens": 76892782.0, + "reward": 1.4904985427856445, + "reward_std": 1.1410439014434814, + "rewards/accuracy_reward/mean": 0.752217173576355, + "rewards/accuracy_reward/std": 2.1673431396484375, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1267.0, + "completions/max_terminated_length": 1267.0, + "completions/mean_length": 564.265625, + "completions/mean_terminated_length": 564.265625, + "completions/min_length": 360.0, + "completions/min_terminated_length": 360.0, + "epoch": 0.27613293051359517, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.048066504299640656, + "learning_rate": 2.6419761207169554e-06, + "loss": -0.0191, + "num_tokens": 77054543.0, + "reward": 4.89056396484375, + "reward_std": 2.3106813430786133, + "rewards/accuracy_reward/mean": 4.140564441680908, + "rewards/accuracy_reward/std": 3.7814576625823975, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 457 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 958.0, + "completions/max_terminated_length": 958.0, + "completions/mean_length": 502.375, + "completions/mean_terminated_length": 502.375, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, + "epoch": 0.2767371601208459, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04359523206949234, + "learning_rate": 2.6401441685379456e-06, + "loss": 0.0195, + "num_tokens": 77201895.0, + "reward": 3.6887619495391846, + "reward_std": 2.507988929748535, + "rewards/accuracy_reward/mean": 2.9426679611206055, + "rewards/accuracy_reward/std": 3.6962366104125977, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 837.0, + "completions/max_terminated_length": 837.0, + "completions/mean_length": 545.46875, + "completions/mean_terminated_length": 545.46875, + "completions/min_length": 364.0, + "completions/min_terminated_length": 364.0, + "epoch": 0.2773413897280967, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.036934029310941696, + "learning_rate": 2.638308261846446e-06, + "loss": 0.0026, + "num_tokens": 77359685.0, + "reward": 6.536348819732666, + "reward_std": 1.533727765083313, + "rewards/accuracy_reward/mean": 5.794161319732666, + "rewards/accuracy_reward/std": 3.131126642227173, + "rewards/tag_count_reward/mean": 0.7421875, + "rewards/tag_count_reward/std": 0.0625, + "step": 459 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 871.0, + "completions/max_terminated_length": 871.0, + "completions/mean_length": 580.21875, + "completions/mean_terminated_length": 580.21875, + "completions/min_length": 379.0, + "completions/min_terminated_length": 379.0, + "epoch": 0.27794561933534745, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04615308716893196, + "learning_rate": 2.63646840797484e-06, + "loss": 0.0211, + "num_tokens": 77519939.0, + "reward": 4.031771659851074, + "reward_std": 2.330374002456665, + "rewards/accuracy_reward/mean": 3.281771421432495, + "rewards/accuracy_reward/std": 3.699817657470703, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 800.0, + "completions/max_terminated_length": 800.0, + "completions/mean_length": 520.109375, + "completions/mean_terminated_length": 520.109375, + "completions/min_length": 335.0, + "completions/min_terminated_length": 335.0, + "epoch": 0.2785498489425982, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.014926073141396046, + "learning_rate": 2.6346246142712744e-06, + "loss": -0.0029, + "num_tokens": 77657146.0, + "reward": 6.251119613647461, + "reward_std": 0.5528228282928467, + "rewards/accuracy_reward/mean": 5.501119613647461, + "rewards/accuracy_reward/std": 3.2384722232818604, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 461 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 841.0, + "completions/max_terminated_length": 841.0, + "completions/mean_length": 557.46875, + "completions/mean_terminated_length": 557.46875, + "completions/min_length": 381.0, + "completions/min_terminated_length": 381.0, + "epoch": 0.2791540785498489, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04110335186123848, + "learning_rate": 2.6327768880996323e-06, + "loss": 0.0049, + "num_tokens": 77802264.0, + "reward": 4.493711948394775, + "reward_std": 1.6617701053619385, + "rewards/accuracy_reward/mean": 3.7437119483947754, + "rewards/accuracy_reward/std": 3.730848550796509, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 462 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1223.0, + "completions/max_terminated_length": 1223.0, + "completions/mean_length": 674.5, + "completions/mean_terminated_length": 674.5, + "completions/min_length": 403.0, + "completions/min_terminated_length": 403.0, + "epoch": 0.2797583081570997, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.049092214554548264, + "learning_rate": 2.6309252368395013e-06, + "loss": 0.0022, + "num_tokens": 77939480.0, + "reward": 5.239736080169678, + "reward_std": 2.215193271636963, + "rewards/accuracy_reward/mean": 4.489736080169678, + "rewards/accuracy_reward/std": 3.6628549098968506, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 463 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1265.0, + "completions/mean_length": 523.515625, + "completions/mean_terminated_length": 499.3174743652344, + "completions/min_length": 284.0, + "completions/min_terminated_length": 284.0, + "epoch": 0.28036253776435044, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04745474457740784, + "learning_rate": 2.6290696678861465e-06, + "loss": -0.0132, + "num_tokens": 78154841.0, + "reward": 3.2391467094421387, + "reward_std": 2.4453039169311523, + "rewards/accuracy_reward/mean": 2.5008654594421387, + "rewards/accuracy_reward/std": 3.553781270980835, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 780.0, + "completions/max_terminated_length": 780.0, + "completions/mean_length": 523.578125, + "completions/mean_terminated_length": 523.578125, + "completions/min_length": 314.0, + "completions/min_terminated_length": 314.0, + "epoch": 0.2809667673716012, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.051617883145809174, + "learning_rate": 2.6272101886504787e-06, + "loss": 0.0146, + "num_tokens": 78335150.0, + "reward": 6.286848068237305, + "reward_std": 2.3947179317474365, + "rewards/accuracy_reward/mean": 5.540754318237305, + "rewards/accuracy_reward/std": 3.301571846008301, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 903.0, + "completions/max_terminated_length": 903.0, + "completions/mean_length": 574.6875, + "completions/mean_terminated_length": 574.6875, + "completions/min_length": 380.0, + "completions/min_terminated_length": 380.0, + "epoch": 0.28157099697885196, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.019487226381897926, + "learning_rate": 2.625346806559026e-06, + "loss": 0.0045, + "num_tokens": 78480570.0, + "reward": 4.235444068908691, + "reward_std": 0.6225156188011169, + "rewards/accuracy_reward/mean": 3.4854443073272705, + "rewards/accuracy_reward/std": 3.7267212867736816, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 466 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 996.0, + "completions/mean_length": 586.53125, + "completions/mean_terminated_length": 563.3333740234375, + "completions/min_length": 304.0, + "completions/min_terminated_length": 304.0, + "epoch": 0.2821752265861027, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.056274428963661194, + "learning_rate": 2.623479529053905e-06, + "loss": 0.0253, + "num_tokens": 78629836.0, + "reward": 4.459820747375488, + "reward_std": 2.844923496246338, + "rewards/accuracy_reward/mean": 3.7215397357940674, + "rewards/accuracy_reward/std": 3.763864517211914, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 467 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2010.0, + "completions/max_terminated_length": 1220.0, + "completions/mean_length": 638.625, + "completions/mean_terminated_length": 616.857177734375, + "completions/min_length": 316.0, + "completions/min_terminated_length": 316.0, + "epoch": 0.2827794561933535, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.031863536685705185, + "learning_rate": 2.6216083635927896e-06, + "loss": -0.0437, + "num_tokens": 78787946.0, + "reward": 4.513643264770508, + "reward_std": 1.475118637084961, + "rewards/accuracy_reward/mean": 3.775362014770508, + "rewards/accuracy_reward/std": 3.678476095199585, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 857.0, + "completions/max_terminated_length": 857.0, + "completions/mean_length": 508.96875, + "completions/mean_terminated_length": 508.96875, + "completions/min_length": 289.0, + "completions/min_terminated_length": 289.0, + "epoch": 0.28338368580060425, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04587899148464203, + "learning_rate": 2.6197333176488816e-06, + "loss": 0.006, + "num_tokens": 78924840.0, + "reward": 2.4552886486053467, + "reward_std": 1.9947832822799683, + "rewards/accuracy_reward/mean": 1.7052885293960571, + "rewards/accuracy_reward/std": 3.2087836265563965, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 469 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 791.0, + "completions/max_terminated_length": 791.0, + "completions/mean_length": 484.734375, + "completions/mean_terminated_length": 484.734375, + "completions/min_length": 316.0, + "completions/min_terminated_length": 316.0, + "epoch": 0.283987915407855, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.034977372735738754, + "learning_rate": 2.617854398710881e-06, + "loss": 0.0031, + "num_tokens": 79042055.0, + "reward": 3.9277772903442383, + "reward_std": 1.7928844690322876, + "rewards/accuracy_reward/mean": 3.1777772903442383, + "rewards/accuracy_reward/std": 3.7331113815307617, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1802.0, + "completions/max_terminated_length": 1802.0, + "completions/mean_length": 605.0, + "completions/mean_terminated_length": 605.0, + "completions/min_length": 303.0, + "completions/min_terminated_length": 303.0, + "epoch": 0.2845921450151057, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.032409701496362686, + "learning_rate": 2.615971614282955e-06, + "loss": 0.0091, + "num_tokens": 79205575.0, + "reward": 3.8730742931365967, + "reward_std": 1.822340726852417, + "rewards/accuracy_reward/mean": 3.123074531555176, + "rewards/accuracy_reward/std": 3.5893118381500244, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 471 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 895.0, + "completions/max_terminated_length": 895.0, + "completions/mean_length": 556.921875, + "completions/mean_terminated_length": 556.921875, + "completions/min_length": 324.0, + "completions/min_terminated_length": 324.0, + "epoch": 0.2851963746223565, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03986025229096413, + "learning_rate": 2.614084971884711e-06, + "loss": -0.0017, + "num_tokens": 79428354.0, + "reward": 6.88890266418457, + "reward_std": 2.1956846714019775, + "rewards/accuracy_reward/mean": 6.13890266418457, + "rewards/accuracy_reward/std": 2.7675843238830566, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1019.0, + "completions/max_terminated_length": 1019.0, + "completions/mean_length": 533.15625, + "completions/mean_terminated_length": 533.15625, + "completions/min_length": 330.0, + "completions/min_terminated_length": 330.0, + "epoch": 0.28580060422960724, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03985975310206413, + "learning_rate": 2.612194479051164e-06, + "loss": 0.0003, + "num_tokens": 79642348.0, + "reward": 5.874732971191406, + "reward_std": 2.0929148197174072, + "rewards/accuracy_reward/mean": 5.124732971191406, + "rewards/accuracy_reward/std": 3.3420348167419434, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 473 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1242.0, + "completions/max_terminated_length": 1242.0, + "completions/mean_length": 646.875, + "completions/mean_terminated_length": 646.875, + "completions/min_length": 333.0, + "completions/min_terminated_length": 333.0, + "epoch": 0.286404833836858, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04150468856096268, + "learning_rate": 2.6103001433327065e-06, + "loss": -0.0009, + "num_tokens": 79933124.0, + "reward": 6.183465480804443, + "reward_std": 1.945996642112732, + "rewards/accuracy_reward/mean": 5.433465480804443, + "rewards/accuracy_reward/std": 3.279968500137329, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 474 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 820.0, + "completions/max_terminated_length": 820.0, + "completions/mean_length": 581.984375, + "completions/mean_terminated_length": 581.984375, + "completions/min_length": 393.0, + "completions/min_terminated_length": 393.0, + "epoch": 0.28700906344410876, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.047917574644088745, + "learning_rate": 2.6084019722950794e-06, + "loss": 0.0035, + "num_tokens": 80077491.0, + "reward": 4.29939079284668, + "reward_std": 3.0095319747924805, + "rewards/accuracy_reward/mean": 3.5493907928466797, + "rewards/accuracy_reward/std": 3.758546829223633, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1137.0, + "completions/max_terminated_length": 1137.0, + "completions/mean_length": 717.8125, + "completions/mean_terminated_length": 717.8125, + "completions/min_length": 444.0, + "completions/min_terminated_length": 444.0, + "epoch": 0.2876132930513595, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.051541365683078766, + "learning_rate": 2.6064999735193415e-06, + "loss": 0.0282, + "num_tokens": 80254615.0, + "reward": 4.13527250289917, + "reward_std": 2.841701030731201, + "rewards/accuracy_reward/mean": 3.38527250289917, + "rewards/accuracy_reward/std": 3.7051165103912354, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1478.0, + "completions/max_terminated_length": 1478.0, + "completions/mean_length": 714.109375, + "completions/mean_terminated_length": 714.109375, + "completions/min_length": 354.0, + "completions/min_terminated_length": 354.0, + "epoch": 0.2882175226586103, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03585570678114891, + "learning_rate": 2.604594154601839e-06, + "loss": -0.0023, + "num_tokens": 80443102.0, + "reward": 4.310504913330078, + "reward_std": 1.477816104888916, + "rewards/accuracy_reward/mean": 3.560504913330078, + "rewards/accuracy_reward/std": 3.802898406982422, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 477 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 834.0, + "completions/max_terminated_length": 834.0, + "completions/mean_length": 489.109375, + "completions/mean_terminated_length": 489.109375, + "completions/min_length": 304.0, + "completions/min_terminated_length": 304.0, + "epoch": 0.28882175226586104, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.01589195616543293, + "learning_rate": 2.6026845231541756e-06, + "loss": 0.0029, + "num_tokens": 80592885.0, + "reward": 2.5031015872955322, + "reward_std": 0.4342397153377533, + "rewards/accuracy_reward/mean": 1.7531014680862427, + "rewards/accuracy_reward/std": 3.1756551265716553, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 478 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 911.0, + "completions/max_terminated_length": 911.0, + "completions/mean_length": 635.171875, + "completions/mean_terminated_length": 635.171875, + "completions/min_length": 412.0, + "completions/min_terminated_length": 412.0, + "epoch": 0.2894259818731118, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.028838910162448883, + "learning_rate": 2.6007710868031804e-06, + "loss": 0.0032, + "num_tokens": 80730784.0, + "reward": 1.0944437980651855, + "reward_std": 1.08780837059021, + "rewards/accuracy_reward/mean": 0.34444373846054077, + "rewards/accuracy_reward/std": 1.5654855966567993, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 479 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1137.0, + "completions/max_terminated_length": 1137.0, + "completions/mean_length": 595.015625, + "completions/mean_terminated_length": 595.015625, + "completions/min_length": 364.0, + "completions/min_terminated_length": 364.0, + "epoch": 0.29003021148036257, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.033331651240587234, + "learning_rate": 2.598853853190882e-06, + "loss": 0.006, + "num_tokens": 80872929.0, + "reward": 3.599351167678833, + "reward_std": 1.396880865097046, + "rewards/accuracy_reward/mean": 2.849351167678833, + "rewards/accuracy_reward/std": 3.5632643699645996, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1150.0, + "completions/max_terminated_length": 1150.0, + "completions/mean_length": 581.828125, + "completions/mean_terminated_length": 581.828125, + "completions/min_length": 286.0, + "completions/min_terminated_length": 286.0, + "epoch": 0.29063444108761327, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.038127947598695755, + "learning_rate": 2.59693282997447e-06, + "loss": 0.0228, + "num_tokens": 81032374.0, + "reward": 5.297410011291504, + "reward_std": 2.5051441192626953, + "rewards/accuracy_reward/mean": 4.547410011291504, + "rewards/accuracy_reward/std": 3.6269564628601074, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 481 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1210.0, + "completions/max_terminated_length": 1210.0, + "completions/mean_length": 597.203125, + "completions/mean_terminated_length": 597.203125, + "completions/min_length": 379.0, + "completions/min_terminated_length": 379.0, + "epoch": 0.29123867069486403, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03236446529626846, + "learning_rate": 2.595008024826274e-06, + "loss": -0.0037, + "num_tokens": 81167379.0, + "reward": 4.169747352600098, + "reward_std": 1.994454026222229, + "rewards/accuracy_reward/mean": 3.4197471141815186, + "rewards/accuracy_reward/std": 3.705766439437866, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 482 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 795.0, + "completions/max_terminated_length": 795.0, + "completions/mean_length": 560.890625, + "completions/mean_terminated_length": 560.890625, + "completions/min_length": 398.0, + "completions/min_terminated_length": 398.0, + "epoch": 0.2918429003021148, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04740156605839729, + "learning_rate": 2.593079445433725e-06, + "loss": -0.0026, + "num_tokens": 81347260.0, + "reward": 4.215717315673828, + "reward_std": 2.6484594345092773, + "rewards/accuracy_reward/mean": 3.46962308883667, + "rewards/accuracy_reward/std": 3.7565953731536865, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 483 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1598.0, + "completions/mean_length": 745.078125, + "completions/mean_terminated_length": 724.3968505859375, + "completions/min_length": 418.0, + "completions/min_terminated_length": 418.0, + "epoch": 0.29244712990936556, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03540149703621864, + "learning_rate": 2.5911470994993292e-06, + "loss": -0.0065, + "num_tokens": 81542017.0, + "reward": 3.800131320953369, + "reward_std": 1.4542495012283325, + "rewards/accuracy_reward/mean": 3.0618503093719482, + "rewards/accuracy_reward/std": 3.6401548385620117, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1275.0, + "completions/max_terminated_length": 1275.0, + "completions/mean_length": 734.171875, + "completions/mean_terminated_length": 734.171875, + "completions/min_length": 396.0, + "completions/min_terminated_length": 396.0, + "epoch": 0.2930513595166163, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04970363900065422, + "learning_rate": 2.5892109947406354e-06, + "loss": -0.0236, + "num_tokens": 81749340.0, + "reward": 2.5190985202789307, + "reward_std": 2.8267922401428223, + "rewards/accuracy_reward/mean": 1.7690982818603516, + "rewards/accuracy_reward/std": 3.008981466293335, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 930.0, + "completions/max_terminated_length": 930.0, + "completions/mean_length": 570.640625, + "completions/mean_terminated_length": 570.640625, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "epoch": 0.2936555891238671, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.05901824310421944, + "learning_rate": 2.5872711388902044e-06, + "loss": -0.0188, + "num_tokens": 81975845.0, + "reward": 4.378573417663574, + "reward_std": 2.3309240341186523, + "rewards/accuracy_reward/mean": 3.628572940826416, + "rewards/accuracy_reward/std": 3.8240275382995605, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 486 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 850.0, + "completions/max_terminated_length": 850.0, + "completions/mean_length": 522.0, + "completions/mean_terminated_length": 522.0, + "completions/min_length": 308.0, + "completions/min_terminated_length": 308.0, + "epoch": 0.29425981873111784, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03351084887981415, + "learning_rate": 2.5853275396955806e-06, + "loss": -0.0011, + "num_tokens": 82117061.0, + "reward": 4.768243789672852, + "reward_std": 1.3814072608947754, + "rewards/accuracy_reward/mean": 4.018243789672852, + "rewards/accuracy_reward/std": 3.7209737300872803, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 487 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1161.0, + "completions/max_terminated_length": 1161.0, + "completions/mean_length": 557.671875, + "completions/mean_terminated_length": 557.671875, + "completions/min_length": 343.0, + "completions/min_terminated_length": 343.0, + "epoch": 0.2948640483383686, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.037500131875276566, + "learning_rate": 2.5833802049192547e-06, + "loss": -0.0048, + "num_tokens": 82288912.0, + "reward": 5.672842502593994, + "reward_std": 1.4818880558013916, + "rewards/accuracy_reward/mean": 4.922842979431152, + "rewards/accuracy_reward/std": 3.6435811519622803, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1186.0, + "completions/max_terminated_length": 1186.0, + "completions/mean_length": 535.09375, + "completions/mean_terminated_length": 535.09375, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 0.29546827794561936, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.029355745762586594, + "learning_rate": 2.5814291423386417e-06, + "loss": 0.0084, + "num_tokens": 82558102.0, + "reward": 3.9773685932159424, + "reward_std": 1.1878936290740967, + "rewards/accuracy_reward/mean": 3.2273685932159424, + "rewards/accuracy_reward/std": 3.6660099029541016, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 489 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1604.0, + "completions/max_terminated_length": 1604.0, + "completions/mean_length": 573.6875, + "completions/mean_terminated_length": 573.6875, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "epoch": 0.29607250755287007, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.029182635247707367, + "learning_rate": 2.5794743597460402e-06, + "loss": -0.0054, + "num_tokens": 82708530.0, + "reward": 3.937267541885376, + "reward_std": 1.3795429468154907, + "rewards/accuracy_reward/mean": 3.187267541885376, + "rewards/accuracy_reward/std": 3.615257501602173, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 786.0, + "completions/mean_length": 597.34375, + "completions/mean_terminated_length": 574.3175048828125, + "completions/min_length": 364.0, + "completions/min_terminated_length": 364.0, + "epoch": 0.29667673716012083, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.024553751572966576, + "learning_rate": 2.5775158649486102e-06, + "loss": -0.0052, + "num_tokens": 82877656.0, + "reward": 4.440078258514404, + "reward_std": 1.1119379997253418, + "rewards/accuracy_reward/mean": 3.701796770095825, + "rewards/accuracy_reward/std": 3.798046350479126, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 491 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 870.0, + "completions/mean_length": 586.453125, + "completions/mean_terminated_length": 563.2540283203125, + "completions/min_length": 311.0, + "completions/min_terminated_length": 311.0, + "epoch": 0.2972809667673716, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05482201650738716, + "learning_rate": 2.5755536657683354e-06, + "loss": -0.0534, + "num_tokens": 83114613.0, + "reward": 3.636718988418579, + "reward_std": 2.853696823120117, + "rewards/accuracy_reward/mean": 2.8984375, + "rewards/accuracy_reward/std": 3.6851582527160645, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1107.0, + "completions/max_terminated_length": 1107.0, + "completions/mean_length": 622.8125, + "completions/mean_terminated_length": 622.8125, + "completions/min_length": 329.0, + "completions/min_terminated_length": 329.0, + "epoch": 0.29788519637462235, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04489274322986603, + "learning_rate": 2.5735877700419947e-06, + "loss": -0.0114, + "num_tokens": 83277065.0, + "reward": 4.52757453918457, + "reward_std": 1.874708890914917, + "rewards/accuracy_reward/mean": 3.7775747776031494, + "rewards/accuracy_reward/std": 3.6884665489196777, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 493 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 878.0, + "completions/mean_length": 646.546875, + "completions/mean_terminated_length": 577.6229248046875, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, + "epoch": 0.2984894259818731, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.040818266570568085, + "learning_rate": 2.571618185621131e-06, + "loss": -0.0396, + "num_tokens": 83429516.0, + "reward": 1.1580488681793213, + "reward_std": 1.3204646110534668, + "rewards/accuracy_reward/mean": 0.44320517778396606, + "rewards/accuracy_reward/std": 1.8258779048919678, + "rewards/tag_count_reward/mean": 0.71484375, + "rewards/tag_count_reward/std": 0.1597815304994583, + "step": 494 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 846.0, + "completions/mean_length": 558.640625, + "completions/mean_terminated_length": 535.0000610351562, + "completions/min_length": 351.0, + "completions/min_terminated_length": 351.0, + "epoch": 0.2990936555891239, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04195362329483032, + "learning_rate": 2.5696449203720192e-06, + "loss": -0.014, + "num_tokens": 83590389.0, + "reward": 7.366582870483398, + "reward_std": 1.0837984085083008, + "rewards/accuracy_reward/mean": 6.632207870483398, + "rewards/accuracy_reward/std": 2.42379093170166, + "rewards/tag_count_reward/mean": 0.734375, + "rewards/tag_count_reward/std": 0.09834947437047958, + "step": 495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 906.0, + "completions/max_terminated_length": 906.0, + "completions/mean_length": 562.25, + "completions/mean_terminated_length": 562.25, + "completions/min_length": 298.0, + "completions/min_terminated_length": 298.0, + "epoch": 0.29969788519637464, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03434981778264046, + "learning_rate": 2.567667982175635e-06, + "loss": 0.009, + "num_tokens": 83738133.0, + "reward": 4.530439853668213, + "reward_std": 1.4511586427688599, + "rewards/accuracy_reward/mean": 3.780439853668213, + "rewards/accuracy_reward/std": 3.63516902923584, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 905.0, + "completions/max_terminated_length": 905.0, + "completions/mean_length": 578.953125, + "completions/mean_terminated_length": 578.953125, + "completions/min_length": 411.0, + "completions/min_terminated_length": 411.0, + "epoch": 0.3003021148036254, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04272037371993065, + "learning_rate": 2.5656873789276226e-06, + "loss": 0.004, + "num_tokens": 83902146.0, + "reward": 6.114291191101074, + "reward_std": 1.895585298538208, + "rewards/accuracy_reward/mean": 5.364291191101074, + "rewards/accuracy_reward/std": 3.388500690460205, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 497 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 980.0, + "completions/max_terminated_length": 980.0, + "completions/mean_length": 618.34375, + "completions/mean_terminated_length": 618.34375, + "completions/min_length": 397.0, + "completions/min_terminated_length": 397.0, + "epoch": 0.30090634441087616, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04965976998209953, + "learning_rate": 2.563703118538266e-06, + "loss": 0.0054, + "num_tokens": 84054584.0, + "reward": 3.362211227416992, + "reward_std": 2.7575767040252686, + "rewards/accuracy_reward/mean": 2.612211227416992, + "rewards/accuracy_reward/std": 3.5410122871398926, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 498 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 751.0, + "completions/max_terminated_length": 751.0, + "completions/mean_length": 497.578125, + "completions/mean_terminated_length": 497.578125, + "completions/min_length": 282.0, + "completions/min_terminated_length": 282.0, + "epoch": 0.30151057401812686, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.036654990166425705, + "learning_rate": 2.5617152089324533e-06, + "loss": 0.017, + "num_tokens": 84256973.0, + "reward": 7.480093002319336, + "reward_std": 1.3644665479660034, + "rewards/accuracy_reward/mean": 6.730093479156494, + "rewards/accuracy_reward/std": 2.1821210384368896, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 499 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 941.0, + "completions/max_terminated_length": 941.0, + "completions/mean_length": 612.234375, + "completions/mean_terminated_length": 612.234375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "epoch": 0.3021148036253776, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03041856922209263, + "learning_rate": 2.559723658049648e-06, + "loss": 0.0068, + "num_tokens": 84436124.0, + "reward": 3.068924903869629, + "reward_std": 1.6953539848327637, + "rewards/accuracy_reward/mean": 2.318924903869629, + "rewards/accuracy_reward/std": 3.3659493923187256, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1003.0, + "completions/max_terminated_length": 1003.0, + "completions/mean_length": 627.625, + "completions/mean_terminated_length": 627.625, + "completions/min_length": 408.0, + "completions/min_terminated_length": 408.0, + "epoch": 0.3027190332326284, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04851679503917694, + "learning_rate": 2.557728473843856e-06, + "loss": 0.0379, + "num_tokens": 84701076.0, + "reward": 3.055959701538086, + "reward_std": 1.901818871498108, + "rewards/accuracy_reward/mean": 2.305959701538086, + "rewards/accuracy_reward/std": 3.524301528930664, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 501 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 745.0, + "completions/max_terminated_length": 745.0, + "completions/mean_length": 549.703125, + "completions/mean_terminated_length": 549.703125, + "completions/min_length": 415.0, + "completions/min_terminated_length": 415.0, + "epoch": 0.30332326283987915, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03179279342293739, + "learning_rate": 2.555729664283595e-06, + "loss": 0.0101, + "num_tokens": 84860577.0, + "reward": 3.5359244346618652, + "reward_std": 0.9699887037277222, + "rewards/accuracy_reward/mean": 2.7859244346618652, + "rewards/accuracy_reward/std": 3.538402795791626, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 502 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1329.0, + "completions/max_terminated_length": 1329.0, + "completions/mean_length": 589.953125, + "completions/mean_terminated_length": 589.953125, + "completions/min_length": 362.0, + "completions/min_terminated_length": 362.0, + "epoch": 0.3039274924471299, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03704957291483879, + "learning_rate": 2.553727237351861e-06, + "loss": 0.0188, + "num_tokens": 85085294.0, + "reward": 4.120010852813721, + "reward_std": 2.0024430751800537, + "rewards/accuracy_reward/mean": 3.3700110912323, + "rewards/accuracy_reward/std": 3.7652597427368164, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 503 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 819.0, + "completions/max_terminated_length": 819.0, + "completions/mean_length": 520.5625, + "completions/mean_terminated_length": 520.5625, + "completions/min_length": 335.0, + "completions/min_terminated_length": 335.0, + "epoch": 0.30453172205438067, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.013698183000087738, + "learning_rate": 2.551721201046098e-06, + "loss": 0.0027, + "num_tokens": 85257874.0, + "reward": 0.8504781126976013, + "reward_std": 0.5269124507904053, + "rewards/accuracy_reward/mean": 0.10047812759876251, + "rewards/accuracy_reward/std": 0.9391628503799438, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 504 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1208.0, + "completions/max_terminated_length": 1208.0, + "completions/mean_length": 646.03125, + "completions/mean_terminated_length": 646.03125, + "completions/min_length": 365.0, + "completions/min_terminated_length": 365.0, + "epoch": 0.30513595166163143, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.013482550159096718, + "learning_rate": 2.5497115633781655e-06, + "loss": 0.0071, + "num_tokens": 85478580.0, + "reward": 4.353405952453613, + "reward_std": 0.47242236137390137, + "rewards/accuracy_reward/mean": 3.603405475616455, + "rewards/accuracy_reward/std": 3.7181553840637207, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 991.0, + "completions/max_terminated_length": 991.0, + "completions/mean_length": 696.6875, + "completions/mean_terminated_length": 696.6875, + "completions/min_length": 405.0, + "completions/min_terminated_length": 405.0, + "epoch": 0.3057401812688822, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03972203657031059, + "learning_rate": 2.547698332374305e-06, + "loss": 0.0336, + "num_tokens": 85645488.0, + "reward": 3.5838546752929688, + "reward_std": 1.7399206161499023, + "rewards/accuracy_reward/mean": 2.8377609252929688, + "rewards/accuracy_reward/std": 3.6274983882904053, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 506 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1377.0, + "completions/max_terminated_length": 1377.0, + "completions/mean_length": 777.59375, + "completions/mean_terminated_length": 777.59375, + "completions/min_length": 423.0, + "completions/min_terminated_length": 423.0, + "epoch": 0.30634441087613296, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.028388697654008865, + "learning_rate": 2.5456815160751114e-06, + "loss": 0.0225, + "num_tokens": 85818470.0, + "reward": 3.4792938232421875, + "reward_std": 0.977197527885437, + "rewards/accuracy_reward/mean": 2.7332000732421875, + "rewards/accuracy_reward/std": 3.578862428665161, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 507 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 719.0, + "completions/max_terminated_length": 719.0, + "completions/mean_length": 531.21875, + "completions/mean_terminated_length": 531.21875, + "completions/min_length": 361.0, + "completions/min_terminated_length": 361.0, + "epoch": 0.30694864048338366, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.027112247422337532, + "learning_rate": 2.5436611225354977e-06, + "loss": 0.0124, + "num_tokens": 85977188.0, + "reward": 1.7956743240356445, + "reward_std": 0.9525031447410583, + "rewards/accuracy_reward/mean": 1.045674443244934, + "rewards/accuracy_reward/std": 2.605496406555176, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 508 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1030.0, + "completions/max_terminated_length": 1030.0, + "completions/mean_length": 582.0, + "completions/mean_terminated_length": 582.0, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "epoch": 0.3075528700906344, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.01158885471522808, + "learning_rate": 2.5416371598246634e-06, + "loss": -0.0046, + "num_tokens": 86119860.0, + "reward": 2.540435552597046, + "reward_std": 0.5497674942016602, + "rewards/accuracy_reward/mean": 1.790435552597046, + "rewards/accuracy_reward/std": 3.1479477882385254, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 509 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1375.0, + "completions/max_terminated_length": 1375.0, + "completions/mean_length": 689.734375, + "completions/mean_terminated_length": 689.734375, + "completions/min_length": 313.0, + "completions/min_terminated_length": 313.0, + "epoch": 0.3081570996978852, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.049008578062057495, + "learning_rate": 2.539609636026064e-06, + "loss": 0.0451, + "num_tokens": 86313667.0, + "reward": 4.394654750823975, + "reward_std": 2.276461362838745, + "rewards/accuracy_reward/mean": 3.6446547508239746, + "rewards/accuracy_reward/std": 3.724984884262085, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 510 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 788.0, + "completions/max_terminated_length": 788.0, + "completions/mean_length": 551.421875, + "completions/mean_terminated_length": 551.421875, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, + "epoch": 0.30876132930513595, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04027511924505234, + "learning_rate": 2.5375785592373775e-06, + "loss": -0.0063, + "num_tokens": 86467454.0, + "reward": 3.1604623794555664, + "reward_std": 2.001044273376465, + "rewards/accuracy_reward/mean": 2.4104623794555664, + "rewards/accuracy_reward/std": 3.789459705352783, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 511 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1340.0, + "completions/max_terminated_length": 1340.0, + "completions/mean_length": 610.171875, + "completions/mean_terminated_length": 610.171875, + "completions/min_length": 299.0, + "completions/min_terminated_length": 299.0, + "epoch": 0.3093655589123867, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.02406211756169796, + "learning_rate": 2.5355439375704705e-06, + "loss": -0.0084, + "num_tokens": 86636969.0, + "reward": 2.9290359020233154, + "reward_std": 0.8371422290802002, + "rewards/accuracy_reward/mean": 2.1790359020233154, + "rewards/accuracy_reward/std": 3.4533982276916504, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 512 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 738.0, + "completions/max_terminated_length": 738.0, + "completions/mean_length": 535.828125, + "completions/mean_terminated_length": 535.828125, + "completions/min_length": 375.0, + "completions/min_terminated_length": 375.0, + "epoch": 0.30996978851963747, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.001324254204519093, + "learning_rate": 2.5335057791513693e-06, + "loss": -0.0007, + "num_tokens": 86776014.0, + "reward": 2.608144521713257, + "reward_std": 0.047147080302238464, + "rewards/accuracy_reward/mean": 1.8581445217132568, + "rewards/accuracy_reward/std": 3.226409673690796, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 513 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 906.0, + "completions/mean_length": 664.875, + "completions/mean_terminated_length": 596.8524169921875, + "completions/min_length": 411.0, + "completions/min_terminated_length": 411.0, + "epoch": 0.31057401812688823, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.03275952860713005, + "learning_rate": 2.531464092120225e-06, + "loss": -0.0476, + "num_tokens": 86885190.0, + "reward": 0.8680546879768372, + "reward_std": 0.9936413168907166, + "rewards/accuracy_reward/mean": 0.15321093797683716, + "rewards/accuracy_reward/std": 1.3398962020874023, + "rewards/tag_count_reward/mean": 0.71484375, + "rewards/tag_count_reward/std": 0.1597815304994583, + "step": 514 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 766.0, + "completions/max_terminated_length": 766.0, + "completions/mean_length": 477.84375, + "completions/mean_terminated_length": 477.84375, + "completions/min_length": 286.0, + "completions/min_terminated_length": 286.0, + "epoch": 0.311178247734139, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.029543830081820488, + "learning_rate": 2.52941888463128e-06, + "loss": 0.0178, + "num_tokens": 87024860.0, + "reward": 5.498080253601074, + "reward_std": 0.9759992957115173, + "rewards/accuracy_reward/mean": 4.748080253601074, + "rewards/accuracy_reward/std": 3.5845179557800293, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 515 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 888.0, + "completions/max_terminated_length": 888.0, + "completions/mean_length": 571.0, + "completions/mean_terminated_length": 571.0, + "completions/min_length": 349.0, + "completions/min_terminated_length": 349.0, + "epoch": 0.31178247734138975, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.026302644982933998, + "learning_rate": 2.5273701648528393e-06, + "loss": 0.0197, + "num_tokens": 87163228.0, + "reward": 4.8976545333862305, + "reward_std": 1.288057565689087, + "rewards/accuracy_reward/mean": 4.1476545333862305, + "rewards/accuracy_reward/std": 3.687208890914917, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 516 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 799.0, + "completions/mean_length": 557.90625, + "completions/mean_terminated_length": 534.2540283203125, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, + "epoch": 0.31238670694864046, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03801516070961952, + "learning_rate": 2.525317940967235e-06, + "loss": -0.0182, + "num_tokens": 87306086.0, + "reward": 1.6542092561721802, + "reward_std": 1.6758121252059937, + "rewards/accuracy_reward/mean": 0.9159281253814697, + "rewards/accuracy_reward/std": 2.4932515621185303, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 517 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1305.0, + "completions/max_terminated_length": 1305.0, + "completions/mean_length": 718.59375, + "completions/mean_terminated_length": 718.59375, + "completions/min_length": 465.0, + "completions/min_terminated_length": 465.0, + "epoch": 0.3129909365558912, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04718652740120888, + "learning_rate": 2.5232622211707933e-06, + "loss": 0.026, + "num_tokens": 87487692.0, + "reward": 3.5497779846191406, + "reward_std": 1.6930100917816162, + "rewards/accuracy_reward/mean": 2.7997782230377197, + "rewards/accuracy_reward/std": 3.6258392333984375, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 518 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 991.0, + "completions/max_terminated_length": 991.0, + "completions/mean_length": 551.3125, + "completions/mean_terminated_length": 551.3125, + "completions/min_length": 308.0, + "completions/min_terminated_length": 308.0, + "epoch": 0.313595166163142, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05105172097682953, + "learning_rate": 2.521203013673802e-06, + "loss": -0.0067, + "num_tokens": 87671152.0, + "reward": 6.754430294036865, + "reward_std": 2.2802343368530273, + "rewards/accuracy_reward/mean": 6.004430294036865, + "rewards/accuracy_reward/std": 2.950695037841797, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 519 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 879.0, + "completions/max_terminated_length": 879.0, + "completions/mean_length": 535.828125, + "completions/mean_terminated_length": 535.828125, + "completions/min_length": 342.0, + "completions/min_terminated_length": 342.0, + "epoch": 0.31419939577039274, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03179646655917168, + "learning_rate": 2.5191403267004815e-06, + "loss": 0.0052, + "num_tokens": 87873829.0, + "reward": 5.935892105102539, + "reward_std": 0.8834279179573059, + "rewards/accuracy_reward/mean": 5.185892105102539, + "rewards/accuracy_reward/std": 3.5056185722351074, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 926.0, + "completions/max_terminated_length": 926.0, + "completions/mean_length": 583.921875, + "completions/mean_terminated_length": 583.921875, + "completions/min_length": 362.0, + "completions/min_terminated_length": 362.0, + "epoch": 0.3148036253776435, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.046373553574085236, + "learning_rate": 2.517074168488944e-06, + "loss": 0.0233, + "num_tokens": 88049264.0, + "reward": 1.8005640506744385, + "reward_std": 1.686297059059143, + "rewards/accuracy_reward/mean": 1.0505640506744385, + "rewards/accuracy_reward/std": 2.6176164150238037, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 521 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 806.0, + "completions/max_terminated_length": 806.0, + "completions/mean_length": 525.578125, + "completions/mean_terminated_length": 525.578125, + "completions/min_length": 284.0, + "completions/min_terminated_length": 284.0, + "epoch": 0.31540785498489426, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03087746910750866, + "learning_rate": 2.5150045472911703e-06, + "loss": 0.0326, + "num_tokens": 88196261.0, + "reward": 3.3936452865600586, + "reward_std": 1.3448193073272705, + "rewards/accuracy_reward/mean": 2.6436452865600586, + "rewards/accuracy_reward/std": 3.4959187507629395, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 522 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1223.0, + "completions/max_terminated_length": 1223.0, + "completions/mean_length": 666.59375, + "completions/mean_terminated_length": 666.59375, + "completions/min_length": 301.0, + "completions/min_terminated_length": 301.0, + "epoch": 0.316012084592145, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.020502189174294472, + "learning_rate": 2.512931471372968e-06, + "loss": 0.0004, + "num_tokens": 88427355.0, + "reward": 3.036820411682129, + "reward_std": 0.7868330478668213, + "rewards/accuracy_reward/mean": 2.286820888519287, + "rewards/accuracy_reward/std": 3.376790761947632, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 523 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 985.0, + "completions/mean_length": 534.546875, + "completions/mean_terminated_length": 510.5238342285156, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "epoch": 0.3166163141993958, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.029121769592165947, + "learning_rate": 2.510854949013946e-06, + "loss": 0.0, + "num_tokens": 88558574.0, + "reward": 3.7023651599884033, + "reward_std": 1.2583508491516113, + "rewards/accuracy_reward/mean": 2.9640841484069824, + "rewards/accuracy_reward/std": 3.568842649459839, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 524 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 833.0, + "completions/max_terminated_length": 833.0, + "completions/mean_length": 516.390625, + "completions/mean_terminated_length": 516.390625, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "epoch": 0.31722054380664655, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.024653000757098198, + "learning_rate": 2.5087749885074747e-06, + "loss": -0.0132, + "num_tokens": 88729719.0, + "reward": 4.524803161621094, + "reward_std": 1.2103779315948486, + "rewards/accuracy_reward/mean": 3.7748026847839355, + "rewards/accuracy_reward/std": 3.7551305294036865, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 525 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1096.0, + "completions/max_terminated_length": 1096.0, + "completions/mean_length": 626.515625, + "completions/mean_terminated_length": 626.515625, + "completions/min_length": 338.0, + "completions/min_terminated_length": 338.0, + "epoch": 0.31782477341389725, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04805191978812218, + "learning_rate": 2.506691598160657e-06, + "loss": -0.008, + "num_tokens": 88881288.0, + "reward": 2.6985931396484375, + "reward_std": 1.9655182361602783, + "rewards/accuracy_reward/mean": 1.948593258857727, + "rewards/accuracy_reward/std": 3.3415441513061523, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 526 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1425.0, + "completions/max_terminated_length": 1425.0, + "completions/mean_length": 661.671875, + "completions/mean_terminated_length": 661.671875, + "completions/min_length": 376.0, + "completions/min_terminated_length": 376.0, + "epoch": 0.318429003021148, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04120175167918205, + "learning_rate": 2.5046047862942956e-06, + "loss": 0.0189, + "num_tokens": 89047731.0, + "reward": 3.9209141731262207, + "reward_std": 1.8763960599899292, + "rewards/accuracy_reward/mean": 3.1709144115448, + "rewards/accuracy_reward/std": 3.681950569152832, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 527 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 690.0, + "completions/max_terminated_length": 690.0, + "completions/mean_length": 508.046875, + "completions/mean_terminated_length": 508.046875, + "completions/min_length": 337.0, + "completions/min_terminated_length": 337.0, + "epoch": 0.3190332326283988, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.02834552899003029, + "learning_rate": 2.5025145612428566e-06, + "loss": -0.0048, + "num_tokens": 89193910.0, + "reward": 4.587278366088867, + "reward_std": 1.1121737957000732, + "rewards/accuracy_reward/mean": 3.837277889251709, + "rewards/accuracy_reward/std": 3.748697519302368, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 528 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1252.0, + "completions/mean_length": 682.296875, + "completions/mean_terminated_length": 660.6190795898438, + "completions/min_length": 361.0, + "completions/min_terminated_length": 361.0, + "epoch": 0.31963746223564954, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.020637482404708862, + "learning_rate": 2.500420931354438e-06, + "loss": -0.0148, + "num_tokens": 89352985.0, + "reward": 2.8049497604370117, + "reward_std": 1.0517404079437256, + "rewards/accuracy_reward/mean": 2.066668748855591, + "rewards/accuracy_reward/std": 3.3913016319274902, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 529 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1116.0, + "completions/max_terminated_length": 1116.0, + "completions/mean_length": 662.203125, + "completions/mean_terminated_length": 662.203125, + "completions/min_length": 432.0, + "completions/min_terminated_length": 432.0, + "epoch": 0.3202416918429003, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06494144350290298, + "learning_rate": 2.4983239049907378e-06, + "loss": -0.0024, + "num_tokens": 89558278.0, + "reward": 3.810154676437378, + "reward_std": 3.541553497314453, + "rewards/accuracy_reward/mean": 3.060154438018799, + "rewards/accuracy_reward/std": 3.6779253482818604, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 530 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1219.0, + "completions/max_terminated_length": 1219.0, + "completions/mean_length": 588.28125, + "completions/mean_terminated_length": 588.28125, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 0.32084592145015106, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.005245564505457878, + "learning_rate": 2.4962234905270173e-06, + "loss": -0.0046, + "num_tokens": 89718248.0, + "reward": 4.407009124755859, + "reward_std": 0.2702818810939789, + "rewards/accuracy_reward/mean": 3.6570093631744385, + "rewards/accuracy_reward/std": 3.6654748916625977, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 531 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 888.0, + "completions/max_terminated_length": 888.0, + "completions/mean_length": 480.8125, + "completions/mean_terminated_length": 480.8125, + "completions/min_length": 292.0, + "completions/min_terminated_length": 292.0, + "epoch": 0.3214501510574018, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.02411147579550743, + "learning_rate": 2.494119696352071e-06, + "loss": -0.0045, + "num_tokens": 89864988.0, + "reward": 4.005889892578125, + "reward_std": 0.8250706791877747, + "rewards/accuracy_reward/mean": 3.255889654159546, + "rewards/accuracy_reward/std": 3.707119941711426, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 532 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1357.0, + "completions/max_terminated_length": 1357.0, + "completions/mean_length": 628.84375, + "completions/mean_terminated_length": 628.84375, + "completions/min_length": 345.0, + "completions/min_terminated_length": 345.0, + "epoch": 0.3220543806646526, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03191345930099487, + "learning_rate": 2.492012530868191e-06, + "loss": -0.0087, + "num_tokens": 90056130.0, + "reward": 3.6830086708068848, + "reward_std": 1.516930103302002, + "rewards/accuracy_reward/mean": 2.9330086708068848, + "rewards/accuracy_reward/std": 3.658022165298462, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 533 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1128.0, + "completions/max_terminated_length": 1128.0, + "completions/mean_length": 563.328125, + "completions/mean_terminated_length": 563.328125, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "epoch": 0.32265861027190335, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.029607553035020828, + "learning_rate": 2.4899020024911325e-06, + "loss": 0.0054, + "num_tokens": 90210055.0, + "reward": 5.761417388916016, + "reward_std": 1.4409503936767578, + "rewards/accuracy_reward/mean": 5.011417388916016, + "rewards/accuracy_reward/std": 3.4695990085601807, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 534 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 866.0, + "completions/max_terminated_length": 866.0, + "completions/mean_length": 503.015625, + "completions/mean_terminated_length": 503.015625, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "epoch": 0.32326283987915405, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.02949569933116436, + "learning_rate": 2.4877881196500837e-06, + "loss": 0.0101, + "num_tokens": 90363560.0, + "reward": 3.010479211807251, + "reward_std": 1.3915445804595947, + "rewards/accuracy_reward/mean": 2.260479211807251, + "rewards/accuracy_reward/std": 3.4176697731018066, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 535 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 815.0, + "completions/max_terminated_length": 815.0, + "completions/mean_length": 463.640625, + "completions/mean_terminated_length": 463.640625, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.3238670694864048, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.025498291477560997, + "learning_rate": 2.485670890787629e-06, + "loss": 0.0004, + "num_tokens": 90551505.0, + "reward": 4.506355285644531, + "reward_std": 1.2198867797851562, + "rewards/accuracy_reward/mean": 3.7563555240631104, + "rewards/accuracy_reward/std": 3.5658669471740723, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 536 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 917.0, + "completions/max_terminated_length": 917.0, + "completions/mean_length": 528.265625, + "completions/mean_terminated_length": 528.265625, + "completions/min_length": 347.0, + "completions/min_terminated_length": 347.0, + "epoch": 0.3244712990936556, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04671604186296463, + "learning_rate": 2.4835503243597184e-06, + "loss": 0.0286, + "num_tokens": 90708210.0, + "reward": 4.46537971496582, + "reward_std": 2.2433621883392334, + "rewards/accuracy_reward/mean": 3.7153801918029785, + "rewards/accuracy_reward/std": 3.629692316055298, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 537 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 883.0, + "completions/max_terminated_length": 883.0, + "completions/mean_length": 669.25, + "completions/mean_terminated_length": 669.25, + "completions/min_length": 453.0, + "completions/min_terminated_length": 453.0, + "epoch": 0.32507552870090634, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03908224031329155, + "learning_rate": 2.4814264288356283e-06, + "loss": -0.0199, + "num_tokens": 90872930.0, + "reward": 4.901645183563232, + "reward_std": 2.051635265350342, + "rewards/accuracy_reward/mean": 4.151645183563232, + "rewards/accuracy_reward/std": 3.6601383686065674, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 538 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1052.0, + "completions/max_terminated_length": 1052.0, + "completions/mean_length": 694.296875, + "completions/mean_terminated_length": 694.296875, + "completions/min_length": 446.0, + "completions/min_terminated_length": 446.0, + "epoch": 0.3256797583081571, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04716293513774872, + "learning_rate": 2.4792992126979334e-06, + "loss": 0.0322, + "num_tokens": 91055877.0, + "reward": 1.9491360187530518, + "reward_std": 2.3858542442321777, + "rewards/accuracy_reward/mean": 1.1991358995437622, + "rewards/accuracy_reward/std": 2.8779895305633545, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 539 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 885.0, + "completions/max_terminated_length": 885.0, + "completions/mean_length": 585.625, + "completions/mean_terminated_length": 585.625, + "completions/min_length": 328.0, + "completions/min_terminated_length": 328.0, + "epoch": 0.32628398791540786, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.04102923348546028, + "learning_rate": 2.47716868444247e-06, + "loss": -0.0028, + "num_tokens": 91298157.0, + "reward": 3.3344578742980957, + "reward_std": 1.6628526449203491, + "rewards/accuracy_reward/mean": 2.5844578742980957, + "rewards/accuracy_reward/std": 3.5565805435180664, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1813.0, + "completions/max_terminated_length": 1813.0, + "completions/mean_length": 643.65625, + "completions/mean_terminated_length": 643.65625, + "completions/min_length": 367.0, + "completions/min_terminated_length": 367.0, + "epoch": 0.3268882175226586, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.005207693669945002, + "learning_rate": 2.4750348525783035e-06, + "loss": -0.0016, + "num_tokens": 91534807.0, + "reward": 0.7307734489440918, + "reward_std": 0.22968539595603943, + "rewards/accuracy_reward/mean": -0.0192265622317791, + "rewards/accuracy_reward/std": 0.28902220726013184, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 541 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1063.0, + "completions/max_terminated_length": 1063.0, + "completions/mean_length": 622.953125, + "completions/mean_terminated_length": 622.953125, + "completions/min_length": 322.0, + "completions/min_terminated_length": 322.0, + "epoch": 0.3274924471299094, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03483951464295387, + "learning_rate": 2.472897725627691e-06, + "loss": -0.0081, + "num_tokens": 91706724.0, + "reward": 5.251262664794922, + "reward_std": 1.6248722076416016, + "rewards/accuracy_reward/mean": 4.501262664794922, + "rewards/accuracy_reward/std": 3.8516979217529297, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 542 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1188.0, + "completions/max_terminated_length": 1188.0, + "completions/mean_length": 634.609375, + "completions/mean_terminated_length": 634.609375, + "completions/min_length": 376.0, + "completions/min_terminated_length": 376.0, + "epoch": 0.32809667673716014, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.02469831518828869, + "learning_rate": 2.470757312126052e-06, + "loss": 0.0136, + "num_tokens": 91860587.0, + "reward": 2.4553961753845215, + "reward_std": 0.6898528933525085, + "rewards/accuracy_reward/mean": 1.705396056175232, + "rewards/accuracy_reward/std": 3.0627660751342773, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 543 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 939.0, + "completions/max_terminated_length": 939.0, + "completions/mean_length": 550.09375, + "completions/mean_terminated_length": 550.09375, + "completions/min_length": 317.0, + "completions/min_terminated_length": 317.0, + "epoch": 0.3287009063444109, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.025791440159082413, + "learning_rate": 2.4686136206219325e-06, + "loss": -0.0054, + "num_tokens": 92020705.0, + "reward": 2.641148328781128, + "reward_std": 1.2219712734222412, + "rewards/accuracy_reward/mean": 1.8911484479904175, + "rewards/accuracy_reward/std": 3.3582777976989746, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 544 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 870.0, + "completions/max_terminated_length": 870.0, + "completions/mean_length": 551.265625, + "completions/mean_terminated_length": 551.265625, + "completions/min_length": 331.0, + "completions/min_terminated_length": 331.0, + "epoch": 0.3293051359516616, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.048963043838739395, + "learning_rate": 2.4664666596769677e-06, + "loss": -0.0034, + "num_tokens": 92161186.0, + "reward": 2.6239254474639893, + "reward_std": 2.785184860229492, + "rewards/accuracy_reward/mean": 1.8739254474639893, + "rewards/accuracy_reward/std": 3.531790018081665, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 545 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 733.0, + "completions/max_terminated_length": 733.0, + "completions/mean_length": 467.0625, + "completions/mean_terminated_length": 467.0625, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, + "epoch": 0.32990936555891237, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.01917732134461403, + "learning_rate": 2.4643164378658537e-06, + "loss": -0.0059, + "num_tokens": 92332854.0, + "reward": 6.062897682189941, + "reward_std": 0.7344235181808472, + "rewards/accuracy_reward/mean": 5.312897682189941, + "rewards/accuracy_reward/std": 3.4243385791778564, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 546 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1105.0, + "completions/max_terminated_length": 1105.0, + "completions/mean_length": 509.328125, + "completions/mean_terminated_length": 509.328125, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "epoch": 0.33051359516616313, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.033634111285209656, + "learning_rate": 2.4621629637763073e-06, + "loss": 0.0094, + "num_tokens": 92487115.0, + "reward": 4.7351179122924805, + "reward_std": 1.353628396987915, + "rewards/accuracy_reward/mean": 3.9890239238739014, + "rewards/accuracy_reward/std": 3.6956331729888916, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 547 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 927.0, + "completions/mean_length": 601.890625, + "completions/mean_terminated_length": 578.9365234375, + "completions/min_length": 359.0, + "completions/min_terminated_length": 359.0, + "epoch": 0.3311178247734139, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03740015998482704, + "learning_rate": 2.4600062460090367e-06, + "loss": -0.0298, + "num_tokens": 92666020.0, + "reward": 4.229815483093262, + "reward_std": 1.5857090950012207, + "rewards/accuracy_reward/mean": 3.49153470993042, + "rewards/accuracy_reward/std": 3.743115186691284, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 548 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1487.0, + "completions/mean_length": 724.359375, + "completions/mean_terminated_length": 703.3492431640625, + "completions/min_length": 343.0, + "completions/min_terminated_length": 343.0, + "epoch": 0.33172205438066465, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.021663019433617592, + "learning_rate": 2.457846293177704e-06, + "loss": -0.0556, + "num_tokens": 92857627.0, + "reward": 2.6924469470977783, + "reward_std": 1.1976463794708252, + "rewards/accuracy_reward/mean": 1.9541655778884888, + "rewards/accuracy_reward/std": 3.2817721366882324, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 549 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 838.0, + "completions/max_terminated_length": 838.0, + "completions/mean_length": 478.640625, + "completions/mean_terminated_length": 478.640625, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "epoch": 0.3323262839879154, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03711553290486336, + "learning_rate": 2.4556831139088906e-06, + "loss": 0.0022, + "num_tokens": 93003428.0, + "reward": 3.8501312732696533, + "reward_std": 1.5130878686904907, + "rewards/accuracy_reward/mean": 3.1079437732696533, + "rewards/accuracy_reward/std": 3.5679001808166504, + "rewards/tag_count_reward/mean": 0.7421875, + "rewards/tag_count_reward/std": 0.0625, + "step": 550 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 962.0, + "completions/max_terminated_length": 962.0, + "completions/mean_length": 548.875, + "completions/mean_terminated_length": 548.875, + "completions/min_length": 334.0, + "completions/min_terminated_length": 334.0, + "epoch": 0.3329305135951662, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04271203652024269, + "learning_rate": 2.453516716842067e-06, + "loss": 0.0162, + "num_tokens": 93138588.0, + "reward": 6.059505939483643, + "reward_std": 2.4866127967834473, + "rewards/accuracy_reward/mean": 5.309506416320801, + "rewards/accuracy_reward/std": 3.240138292312622, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 551 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 979.0, + "completions/max_terminated_length": 979.0, + "completions/mean_length": 557.3125, + "completions/mean_terminated_length": 557.3125, + "completions/min_length": 402.0, + "completions/min_terminated_length": 402.0, + "epoch": 0.33353474320241694, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.015017522498965263, + "learning_rate": 2.4513471106295523e-06, + "loss": 0.0012, + "num_tokens": 93331040.0, + "reward": 0.8670140504837036, + "reward_std": 0.46805626153945923, + "rewards/accuracy_reward/mean": 0.11701406538486481, + "rewards/accuracy_reward/std": 0.9361125826835632, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 552 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 800.0, + "completions/max_terminated_length": 800.0, + "completions/mean_length": 433.390625, + "completions/mean_terminated_length": 433.390625, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "epoch": 0.3341389728096677, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.01990031637251377, + "learning_rate": 2.4491743039364833e-06, + "loss": -0.0034, + "num_tokens": 93470345.0, + "reward": 4.289665699005127, + "reward_std": 0.4655968248844147, + "rewards/accuracy_reward/mean": 3.539665460586548, + "rewards/accuracy_reward/std": 3.681006908416748, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 553 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1396.0, + "completions/max_terminated_length": 1396.0, + "completions/mean_length": 701.25, + "completions/mean_terminated_length": 701.25, + "completions/min_length": 301.0, + "completions/min_terminated_length": 301.0, + "epoch": 0.3347432024169184, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03971569240093231, + "learning_rate": 2.4469983054407796e-06, + "loss": 0.0255, + "num_tokens": 93666201.0, + "reward": 3.156507730484009, + "reward_std": 1.124951958656311, + "rewards/accuracy_reward/mean": 2.406507968902588, + "rewards/accuracy_reward/std": 3.699349880218506, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 554 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 659.0, + "completions/max_terminated_length": 659.0, + "completions/mean_length": 401.84375, + "completions/mean_terminated_length": 401.84375, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.33534743202416917, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04125392809510231, + "learning_rate": 2.444819123833108e-06, + "loss": 0.0038, + "num_tokens": 93813023.0, + "reward": 5.170869827270508, + "reward_std": 2.3931639194488525, + "rewards/accuracy_reward/mean": 4.420869827270508, + "rewards/accuracy_reward/std": 3.6855781078338623, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 555 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1022.0, + "completions/mean_length": 704.125, + "completions/mean_terminated_length": 682.793701171875, + "completions/min_length": 423.0, + "completions/min_terminated_length": 423.0, + "epoch": 0.33595166163141993, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04724084958434105, + "learning_rate": 2.4426367678168487e-06, + "loss": 0.0058, + "num_tokens": 94009767.0, + "reward": 3.5908031463623047, + "reward_std": 2.223217010498047, + "rewards/accuracy_reward/mean": 2.8525218963623047, + "rewards/accuracy_reward/std": 3.597660779953003, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 556 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 860.0, + "completions/max_terminated_length": 860.0, + "completions/mean_length": 591.0625, + "completions/mean_terminated_length": 591.0625, + "completions/min_length": 394.0, + "completions/min_terminated_length": 394.0, + "epoch": 0.3365558912386707, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05310635268688202, + "learning_rate": 2.4404512461080595e-06, + "loss": 0.0168, + "num_tokens": 94161899.0, + "reward": 7.3017401695251465, + "reward_std": 1.6050300598144531, + "rewards/accuracy_reward/mean": 6.5517401695251465, + "rewards/accuracy_reward/std": 2.4207024574279785, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 557 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 993.0, + "completions/max_terminated_length": 993.0, + "completions/mean_length": 671.984375, + "completions/mean_terminated_length": 671.984375, + "completions/min_length": 462.0, + "completions/min_terminated_length": 462.0, + "epoch": 0.33716012084592145, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03643075004220009, + "learning_rate": 2.438262567435442e-06, + "loss": 0.0255, + "num_tokens": 94312778.0, + "reward": 7.280104160308838, + "reward_std": 1.4862345457077026, + "rewards/accuracy_reward/mean": 6.53010368347168, + "rewards/accuracy_reward/std": 2.493168354034424, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 558 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 914.0, + "completions/max_terminated_length": 914.0, + "completions/mean_length": 500.484375, + "completions/mean_terminated_length": 500.484375, + "completions/min_length": 294.0, + "completions/min_terminated_length": 294.0, + "epoch": 0.3377643504531722, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.030779782682657242, + "learning_rate": 2.4360707405403062e-06, + "loss": -0.0191, + "num_tokens": 94453817.0, + "reward": 5.607453346252441, + "reward_std": 1.41581130027771, + "rewards/accuracy_reward/mean": 4.857452869415283, + "rewards/accuracy_reward/std": 3.4316701889038086, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 559 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 788.0, + "completions/max_terminated_length": 788.0, + "completions/mean_length": 425.796875, + "completions/mean_terminated_length": 425.796875, + "completions/min_length": 273.0, + "completions/min_terminated_length": 273.0, + "epoch": 0.338368580060423, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.053527574986219406, + "learning_rate": 2.4338757741765366e-06, + "loss": 0.0045, + "num_tokens": 94596556.0, + "reward": 4.77479362487793, + "reward_std": 3.4114034175872803, + "rewards/accuracy_reward/mean": 4.02479362487793, + "rewards/accuracy_reward/std": 3.6931724548339844, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1074.0, + "completions/max_terminated_length": 1074.0, + "completions/mean_length": 606.828125, + "completions/mean_terminated_length": 606.828125, + "completions/min_length": 414.0, + "completions/min_terminated_length": 414.0, + "epoch": 0.33897280966767374, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.031286269426345825, + "learning_rate": 2.4316776771105536e-06, + "loss": 0.0031, + "num_tokens": 94744801.0, + "reward": 2.611846923828125, + "reward_std": 1.6566126346588135, + "rewards/accuracy_reward/mean": 1.861846923828125, + "rewards/accuracy_reward/std": 3.548021078109741, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 561 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 905.0, + "completions/max_terminated_length": 905.0, + "completions/mean_length": 580.703125, + "completions/mean_terminated_length": 580.703125, + "completions/min_length": 309.0, + "completions/min_terminated_length": 309.0, + "epoch": 0.3395770392749245, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.03633186221122742, + "learning_rate": 2.4294764581212847e-06, + "loss": 0.0023, + "num_tokens": 94918014.0, + "reward": 3.998330593109131, + "reward_std": 1.213173747062683, + "rewards/accuracy_reward/mean": 3.24833083152771, + "rewards/accuracy_reward/std": 3.712461471557617, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 562 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 767.0, + "completions/max_terminated_length": 767.0, + "completions/mean_length": 477.671875, + "completions/mean_terminated_length": 477.671875, + "completions/min_length": 275.0, + "completions/min_terminated_length": 275.0, + "epoch": 0.3401812688821752, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.02985767088830471, + "learning_rate": 2.427272126000124e-06, + "loss": 0.0072, + "num_tokens": 95085993.0, + "reward": 7.68345832824707, + "reward_std": 1.2243050336837769, + "rewards/accuracy_reward/mean": 6.9334588050842285, + "rewards/accuracy_reward/std": 1.8079814910888672, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 563 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 896.0, + "completions/max_terminated_length": 896.0, + "completions/mean_length": 557.78125, + "completions/mean_terminated_length": 557.78125, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "epoch": 0.34078549848942596, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04043079540133476, + "learning_rate": 2.4250646895508992e-06, + "loss": 0.0301, + "num_tokens": 95229227.0, + "reward": 4.369527816772461, + "reward_std": 2.038010597229004, + "rewards/accuracy_reward/mean": 3.61952805519104, + "rewards/accuracy_reward/std": 3.740083694458008, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 564 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 884.0, + "completions/mean_length": 575.640625, + "completions/mean_terminated_length": 552.2698974609375, + "completions/min_length": 277.0, + "completions/min_terminated_length": 277.0, + "epoch": 0.3413897280966767, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04135991632938385, + "learning_rate": 2.4228541575898362e-06, + "loss": -0.0708, + "num_tokens": 95359316.0, + "reward": 6.105077743530273, + "reward_std": 2.340317487716675, + "rewards/accuracy_reward/mean": 5.366796493530273, + "rewards/accuracy_reward/std": 3.376861095428467, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 565 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 980.0, + "completions/max_terminated_length": 980.0, + "completions/mean_length": 581.015625, + "completions/mean_terminated_length": 581.015625, + "completions/min_length": 354.0, + "completions/min_terminated_length": 354.0, + "epoch": 0.3419939577039275, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0675433874130249, + "learning_rate": 2.4206405389455256e-06, + "loss": -0.0174, + "num_tokens": 95632341.0, + "reward": 4.891188621520996, + "reward_std": 2.6678926944732666, + "rewards/accuracy_reward/mean": 4.141189098358154, + "rewards/accuracy_reward/std": 3.8975374698638916, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 566 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 902.0, + "completions/max_terminated_length": 902.0, + "completions/mean_length": 520.375, + "completions/mean_terminated_length": 520.375, + "completions/min_length": 304.0, + "completions/min_terminated_length": 304.0, + "epoch": 0.34259818731117825, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.033063605427742004, + "learning_rate": 2.418423842458884e-06, + "loss": -0.0199, + "num_tokens": 95770205.0, + "reward": 5.380177021026611, + "reward_std": 0.9891994595527649, + "rewards/accuracy_reward/mean": 4.630177021026611, + "rewards/accuracy_reward/std": 3.6153481006622314, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 567 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1841.0, + "completions/mean_length": 627.84375, + "completions/mean_terminated_length": 582.0322265625, + "completions/min_length": 337.0, + "completions/min_terminated_length": 337.0, + "epoch": 0.343202416918429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.027961302548646927, + "learning_rate": 2.41620407698312e-06, + "loss": -0.0139, + "num_tokens": 95903507.0, + "reward": 2.720022201538086, + "reward_std": 1.329418659210205, + "rewards/accuracy_reward/mean": 1.9934598207473755, + "rewards/accuracy_reward/std": 3.2360284328460693, + "rewards/tag_count_reward/mean": 0.7265625, + "rewards/tag_count_reward/std": 0.13152606785297394, + "step": 568 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1144.0, + "completions/max_terminated_length": 1144.0, + "completions/mean_length": 597.015625, + "completions/mean_terminated_length": 597.015625, + "completions/min_length": 368.0, + "completions/min_terminated_length": 368.0, + "epoch": 0.34380664652567977, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04822579026222229, + "learning_rate": 2.4139812513837016e-06, + "loss": 0.0133, + "num_tokens": 96121476.0, + "reward": 4.3747358322143555, + "reward_std": 2.163541555404663, + "rewards/accuracy_reward/mean": 3.6247358322143555, + "rewards/accuracy_reward/std": 3.820134401321411, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 569 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 832.0, + "completions/max_terminated_length": 832.0, + "completions/mean_length": 536.640625, + "completions/mean_terminated_length": 536.640625, + "completions/min_length": 353.0, + "completions/min_terminated_length": 353.0, + "epoch": 0.34441087613293053, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.023145178332924843, + "learning_rate": 2.411755374538317e-06, + "loss": 0.0011, + "num_tokens": 96274301.0, + "reward": 2.4892072677612305, + "reward_std": 1.1825989484786987, + "rewards/accuracy_reward/mean": 1.739207148551941, + "rewards/accuracy_reward/std": 3.1865384578704834, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 570 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1250.0, + "completions/max_terminated_length": 1250.0, + "completions/mean_length": 772.75, + "completions/mean_terminated_length": 772.75, + "completions/min_length": 429.0, + "completions/min_terminated_length": 429.0, + "epoch": 0.3450151057401813, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04158008098602295, + "learning_rate": 2.409526455336841e-06, + "loss": -0.0216, + "num_tokens": 96467341.0, + "reward": 3.758376359939575, + "reward_std": 1.8710322380065918, + "rewards/accuracy_reward/mean": 3.008376359939575, + "rewards/accuracy_reward/std": 3.699916124343872, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 571 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 949.0, + "completions/max_terminated_length": 949.0, + "completions/mean_length": 591.125, + "completions/mean_terminated_length": 591.125, + "completions/min_length": 381.0, + "completions/min_terminated_length": 381.0, + "epoch": 0.345619335347432, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04563106223940849, + "learning_rate": 2.4072945026813008e-06, + "loss": 0.0034, + "num_tokens": 96638645.0, + "reward": 2.987800121307373, + "reward_std": 2.191483974456787, + "rewards/accuracy_reward/mean": 2.237800121307373, + "rewards/accuracy_reward/std": 3.422420024871826, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 572 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1369.0, + "completions/max_terminated_length": 1369.0, + "completions/mean_length": 681.953125, + "completions/mean_terminated_length": 681.953125, + "completions/min_length": 403.0, + "completions/min_terminated_length": 403.0, + "epoch": 0.34622356495468276, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.02722243405878544, + "learning_rate": 2.405059525485835e-06, + "loss": 0.0028, + "num_tokens": 96815986.0, + "reward": 4.638326644897461, + "reward_std": 0.6692591905593872, + "rewards/accuracy_reward/mean": 3.888326644897461, + "rewards/accuracy_reward/std": 3.7487642765045166, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 573 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1142.0, + "completions/max_terminated_length": 1142.0, + "completions/mean_length": 497.65625, + "completions/mean_terminated_length": 497.65625, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "epoch": 0.3468277945619335, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03924408182501793, + "learning_rate": 2.4028215326766657e-06, + "loss": 0.0036, + "num_tokens": 96970012.0, + "reward": 2.070704460144043, + "reward_std": 1.4322407245635986, + "rewards/accuracy_reward/mean": 1.320704698562622, + "rewards/accuracy_reward/std": 2.9424078464508057, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 574 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 848.0, + "completions/max_terminated_length": 848.0, + "completions/mean_length": 590.8125, + "completions/mean_terminated_length": 590.8125, + "completions/min_length": 395.0, + "completions/min_terminated_length": 395.0, + "epoch": 0.3474320241691843, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.030834142118692398, + "learning_rate": 2.400580533192056e-06, + "loss": -0.0058, + "num_tokens": 97121712.0, + "reward": 3.2941858768463135, + "reward_std": 1.5511980056762695, + "rewards/accuracy_reward/mean": 2.5441858768463135, + "rewards/accuracy_reward/std": 3.58471417427063, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 575 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 889.0, + "completions/max_terminated_length": 889.0, + "completions/mean_length": 614.890625, + "completions/mean_terminated_length": 614.890625, + "completions/min_length": 387.0, + "completions/min_terminated_length": 387.0, + "epoch": 0.34803625377643505, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.029262850061058998, + "learning_rate": 2.3983365359822804e-06, + "loss": -0.0005, + "num_tokens": 97310921.0, + "reward": 6.168450355529785, + "reward_std": 1.537750244140625, + "rewards/accuracy_reward/mean": 5.418450355529785, + "rewards/accuracy_reward/std": 3.397671937942505, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 576 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1343.0, + "completions/max_terminated_length": 1343.0, + "completions/mean_length": 519.265625, + "completions/mean_terminated_length": 519.265625, + "completions/min_length": 313.0, + "completions/min_terminated_length": 313.0, + "epoch": 0.3486404833836858, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05627370998263359, + "learning_rate": 2.396089550009583e-06, + "loss": 0.0039, + "num_tokens": 97452362.0, + "reward": 3.476137638092041, + "reward_std": 2.966714859008789, + "rewards/accuracy_reward/mean": 2.726137399673462, + "rewards/accuracy_reward/std": 3.5205507278442383, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 577 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 797.0, + "completions/max_terminated_length": 797.0, + "completions/mean_length": 529.578125, + "completions/mean_terminated_length": 529.578125, + "completions/min_length": 396.0, + "completions/min_terminated_length": 396.0, + "epoch": 0.34924471299093657, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.037670958787202835, + "learning_rate": 2.393839584248147e-06, + "loss": 0.015, + "num_tokens": 97606223.0, + "reward": 5.280470848083496, + "reward_std": 1.5917279720306396, + "rewards/accuracy_reward/mean": 4.530470371246338, + "rewards/accuracy_reward/std": 3.656095266342163, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 578 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 765.0, + "completions/max_terminated_length": 765.0, + "completions/mean_length": 508.53125, + "completions/mean_terminated_length": 508.53125, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "epoch": 0.34984894259818733, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03344406560063362, + "learning_rate": 2.3915866476840545e-06, + "loss": 0.013, + "num_tokens": 97734865.0, + "reward": 3.8635361194610596, + "reward_std": 1.4079285860061646, + "rewards/accuracy_reward/mean": 3.1135358810424805, + "rewards/accuracy_reward/std": 3.635268449783325, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 579 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 787.0, + "completions/max_terminated_length": 787.0, + "completions/mean_length": 562.171875, + "completions/mean_terminated_length": 562.171875, + "completions/min_length": 366.0, + "completions/min_terminated_length": 366.0, + "epoch": 0.3504531722054381, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04410478100180626, + "learning_rate": 2.3893307493152536e-06, + "loss": 0.0094, + "num_tokens": 97872188.0, + "reward": 3.3876702785491943, + "reward_std": 1.7772736549377441, + "rewards/accuracy_reward/mean": 2.6376702785491943, + "rewards/accuracy_reward/std": 3.4681243896484375, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 580 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 828.0, + "completions/max_terminated_length": 828.0, + "completions/mean_length": 499.25, + "completions/mean_terminated_length": 499.25, + "completions/min_length": 326.0, + "completions/min_terminated_length": 326.0, + "epoch": 0.3510574018126888, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04582647979259491, + "learning_rate": 2.3870718981515222e-06, + "loss": -0.0011, + "num_tokens": 98096796.0, + "reward": 5.26539945602417, + "reward_std": 2.8229799270629883, + "rewards/accuracy_reward/mean": 4.51930570602417, + "rewards/accuracy_reward/std": 3.6814353466033936, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 581 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 901.0, + "completions/max_terminated_length": 901.0, + "completions/mean_length": 676.8125, + "completions/mean_terminated_length": 676.8125, + "completions/min_length": 474.0, + "completions/min_terminated_length": 474.0, + "epoch": 0.35166163141993956, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.05148163065314293, + "learning_rate": 2.38481010321443e-06, + "loss": 0.0009, + "num_tokens": 98260976.0, + "reward": 3.9411404132843018, + "reward_std": 2.4968056678771973, + "rewards/accuracy_reward/mean": 3.191140651702881, + "rewards/accuracy_reward/std": 3.6809816360473633, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 582 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 680.0, + "completions/max_terminated_length": 680.0, + "completions/mean_length": 512.09375, + "completions/mean_terminated_length": 512.09375, + "completions/min_length": 386.0, + "completions/min_terminated_length": 386.0, + "epoch": 0.3522658610271903, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0495695136487484, + "learning_rate": 2.382545373537304e-06, + "loss": 0.0195, + "num_tokens": 98422982.0, + "reward": 5.176606178283691, + "reward_std": 2.6182305812835693, + "rewards/accuracy_reward/mean": 4.426606178283691, + "rewards/accuracy_reward/std": 3.6906285285949707, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 583 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 771.0, + "completions/max_terminated_length": 771.0, + "completions/mean_length": 522.703125, + "completions/mean_terminated_length": 522.703125, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 0.3528700906344411, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04231693968176842, + "learning_rate": 2.380277718165193e-06, + "loss": 0.0135, + "num_tokens": 98591299.0, + "reward": 5.7620697021484375, + "reward_std": 1.751463770866394, + "rewards/accuracy_reward/mean": 5.0159759521484375, + "rewards/accuracy_reward/std": 3.3919267654418945, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 584 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 922.0, + "completions/max_terminated_length": 922.0, + "completions/mean_length": 581.140625, + "completions/mean_terminated_length": 581.140625, + "completions/min_length": 391.0, + "completions/min_terminated_length": 391.0, + "epoch": 0.35347432024169184, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05911756679415703, + "learning_rate": 2.3780071461548302e-06, + "loss": -0.0109, + "num_tokens": 98744620.0, + "reward": 3.688624858856201, + "reward_std": 3.0077810287475586, + "rewards/accuracy_reward/mean": 2.938624858856201, + "rewards/accuracy_reward/std": 3.6087090969085693, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 585 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 750.0, + "completions/mean_length": 552.34375, + "completions/mean_terminated_length": 528.6032104492188, + "completions/min_length": 341.0, + "completions/min_terminated_length": 341.0, + "epoch": 0.3540785498489426, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04198635369539261, + "learning_rate": 2.3757336665745966e-06, + "loss": -0.0363, + "num_tokens": 98912338.0, + "reward": 5.119095325469971, + "reward_std": 2.1401455402374268, + "rewards/accuracy_reward/mean": 4.380814075469971, + "rewards/accuracy_reward/std": 3.720857858657837, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 586 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 882.0, + "completions/max_terminated_length": 882.0, + "completions/mean_length": 579.03125, + "completions/mean_terminated_length": 579.03125, + "completions/min_length": 333.0, + "completions/min_terminated_length": 333.0, + "epoch": 0.35468277945619336, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.060759540647268295, + "learning_rate": 2.373457288504487e-06, + "loss": 0.0012, + "num_tokens": 99089956.0, + "reward": 4.192228317260742, + "reward_std": 3.075587749481201, + "rewards/accuracy_reward/mean": 3.442228317260742, + "rewards/accuracy_reward/std": 3.6972367763519287, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 587 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 924.0, + "completions/max_terminated_length": 924.0, + "completions/mean_length": 556.5625, + "completions/mean_terminated_length": 556.5625, + "completions/min_length": 389.0, + "completions/min_terminated_length": 389.0, + "epoch": 0.3552870090634441, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04810592532157898, + "learning_rate": 2.3711780210360726e-06, + "loss": 0.0342, + "num_tokens": 99283672.0, + "reward": 4.173217296600342, + "reward_std": 2.360679864883423, + "rewards/accuracy_reward/mean": 3.423217296600342, + "rewards/accuracy_reward/std": 3.8068013191223145, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 588 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 920.0, + "completions/max_terminated_length": 920.0, + "completions/mean_length": 602.984375, + "completions/mean_terminated_length": 602.984375, + "completions/min_length": 416.0, + "completions/min_terminated_length": 416.0, + "epoch": 0.3558912386706949, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.02206558920443058, + "learning_rate": 2.368895873272462e-06, + "loss": 0.0094, + "num_tokens": 99448119.0, + "reward": 2.23367977142334, + "reward_std": 0.8370374441146851, + "rewards/accuracy_reward/mean": 1.4836797714233398, + "rewards/accuracy_reward/std": 3.04524827003479, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 589 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1713.0, + "completions/max_terminated_length": 1713.0, + "completions/mean_length": 655.28125, + "completions/mean_terminated_length": 655.28125, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "epoch": 0.3564954682779456, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.034012503921985626, + "learning_rate": 2.3666108543282716e-06, + "loss": -0.0034, + "num_tokens": 99656169.0, + "reward": 3.000051975250244, + "reward_std": 1.3090198040008545, + "rewards/accuracy_reward/mean": 2.250051975250244, + "rewards/accuracy_reward/std": 3.5080270767211914, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 590 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 873.0, + "completions/max_terminated_length": 873.0, + "completions/mean_length": 567.6875, + "completions/mean_terminated_length": 567.6875, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "epoch": 0.35709969788519635, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04264160990715027, + "learning_rate": 2.36432297332958e-06, + "loss": -0.0073, + "num_tokens": 99801717.0, + "reward": 5.427241325378418, + "reward_std": 2.375474452972412, + "rewards/accuracy_reward/mean": 4.688960075378418, + "rewards/accuracy_reward/std": 3.595393180847168, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 591 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 773.0, + "completions/max_terminated_length": 773.0, + "completions/mean_length": 536.203125, + "completions/mean_terminated_length": 536.203125, + "completions/min_length": 385.0, + "completions/min_terminated_length": 385.0, + "epoch": 0.3577039274924471, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.050478845834732056, + "learning_rate": 2.3620322394139003e-06, + "loss": -0.0226, + "num_tokens": 99917474.0, + "reward": 3.027172088623047, + "reward_std": 2.468444347381592, + "rewards/accuracy_reward/mean": 2.277172088623047, + "rewards/accuracy_reward/std": 3.3999030590057373, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 592 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1154.0, + "completions/mean_length": 715.015625, + "completions/mean_terminated_length": 693.857177734375, + "completions/min_length": 452.0, + "completions/min_terminated_length": 452.0, + "epoch": 0.3583081570996979, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.018919337540864944, + "learning_rate": 2.3597386617301386e-06, + "loss": -0.0064, + "num_tokens": 100092915.0, + "reward": 2.5884594917297363, + "reward_std": 0.7198038101196289, + "rewards/accuracy_reward/mean": 1.8501782417297363, + "rewards/accuracy_reward/std": 3.130091428756714, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 593 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1651.0, + "completions/mean_length": 716.234375, + "completions/mean_terminated_length": 695.0952758789062, + "completions/min_length": 367.0, + "completions/min_terminated_length": 367.0, + "epoch": 0.35891238670694864, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03530818969011307, + "learning_rate": 2.3574422494385576e-06, + "loss": 0.0092, + "num_tokens": 100261394.0, + "reward": 3.7255172729492188, + "reward_std": 1.588619589805603, + "rewards/accuracy_reward/mean": 2.9911422729492188, + "rewards/accuracy_reward/std": 3.797621488571167, + "rewards/tag_count_reward/mean": 0.734375, + "rewards/tag_count_reward/std": 0.09834947437047958, + "step": 594 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1277.0, + "completions/max_terminated_length": 1277.0, + "completions/mean_length": 562.09375, + "completions/mean_terminated_length": 562.09375, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 0.3595166163141994, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03799809142947197, + "learning_rate": 2.3551430117107428e-06, + "loss": -0.0249, + "num_tokens": 100397912.0, + "reward": 5.368924617767334, + "reward_std": 1.686220645904541, + "rewards/accuracy_reward/mean": 4.618924140930176, + "rewards/accuracy_reward/std": 3.564850330352783, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 595 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 831.0, + "completions/max_terminated_length": 831.0, + "completions/mean_length": 552.953125, + "completions/mean_terminated_length": 552.953125, + "completions/min_length": 370.0, + "completions/min_terminated_length": 370.0, + "epoch": 0.36012084592145016, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05423113703727722, + "learning_rate": 2.3528409577295626e-06, + "loss": 0.008, + "num_tokens": 100535541.0, + "reward": 4.199625015258789, + "reward_std": 2.7343766689300537, + "rewards/accuracy_reward/mean": 3.461344003677368, + "rewards/accuracy_reward/std": 4.008610248565674, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 596 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 860.0, + "completions/max_terminated_length": 860.0, + "completions/mean_length": 542.703125, + "completions/mean_terminated_length": 542.703125, + "completions/min_length": 360.0, + "completions/min_terminated_length": 360.0, + "epoch": 0.3607250755287009, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05066938325762749, + "learning_rate": 2.350536096689135e-06, + "loss": 0.0235, + "num_tokens": 100673298.0, + "reward": 4.162407875061035, + "reward_std": 2.6816816329956055, + "rewards/accuracy_reward/mean": 3.412407636642456, + "rewards/accuracy_reward/std": 3.942479133605957, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 597 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1341.0, + "completions/max_terminated_length": 1341.0, + "completions/mean_length": 625.40625, + "completions/mean_terminated_length": 625.40625, + "completions/min_length": 307.0, + "completions/min_terminated_length": 307.0, + "epoch": 0.3613293051359517, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0024659638293087482, + "learning_rate": 2.348228437794786e-06, + "loss": 0.0014, + "num_tokens": 100871852.0, + "reward": 0.826171875, + "reward_std": 0.10899822413921356, + "rewards/accuracy_reward/mean": 0.076171875, + "rewards/accuracy_reward/std": 0.22903487086296082, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 598 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 741.0, + "completions/max_terminated_length": 741.0, + "completions/mean_length": 561.34375, + "completions/mean_terminated_length": 561.34375, + "completions/min_length": 348.0, + "completions/min_terminated_length": 348.0, + "epoch": 0.3619335347432024, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.023167984560132027, + "learning_rate": 2.34591799026302e-06, + "loss": -0.0065, + "num_tokens": 101004834.0, + "reward": 7.853643894195557, + "reward_std": 0.777812659740448, + "rewards/accuracy_reward/mean": 7.103643417358398, + "rewards/accuracy_reward/std": 1.5882084369659424, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 599 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1197.0, + "completions/max_terminated_length": 1197.0, + "completions/mean_length": 625.203125, + "completions/mean_terminated_length": 625.203125, + "completions/min_length": 322.0, + "completions/min_terminated_length": 322.0, + "epoch": 0.36253776435045315, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03684601932764053, + "learning_rate": 2.343604763321476e-06, + "loss": -0.0106, + "num_tokens": 101164623.0, + "reward": 1.7795562744140625, + "reward_std": 1.0811549425125122, + "rewards/accuracy_reward/mean": 1.0295562744140625, + "rewards/accuracy_reward/std": 2.463298797607422, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 600 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 825.0, + "completions/max_terminated_length": 825.0, + "completions/mean_length": 477.65625, + "completions/mean_terminated_length": 477.65625, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "epoch": 0.3631419939577039, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0418093279004097, + "learning_rate": 2.341288766208893e-06, + "loss": -0.0121, + "num_tokens": 101311961.0, + "reward": 4.0944905281066895, + "reward_std": 1.8614161014556885, + "rewards/accuracy_reward/mean": 3.3444905281066895, + "rewards/accuracy_reward/std": 3.7370553016662598, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 601 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 970.0, + "completions/max_terminated_length": 970.0, + "completions/mean_length": 581.640625, + "completions/mean_terminated_length": 581.640625, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "epoch": 0.3637462235649547, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03652803599834442, + "learning_rate": 2.338970008175077e-06, + "loss": -0.0019, + "num_tokens": 101488722.0, + "reward": 5.637659072875977, + "reward_std": 1.026613712310791, + "rewards/accuracy_reward/mean": 4.887659072875977, + "rewards/accuracy_reward/std": 3.502800703048706, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 602 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 708.0, + "completions/max_terminated_length": 708.0, + "completions/mean_length": 465.3125, + "completions/mean_terminated_length": 465.3125, + "completions/min_length": 322.0, + "completions/min_terminated_length": 322.0, + "epoch": 0.36435045317220544, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.01834779605269432, + "learning_rate": 2.3366484984808574e-06, + "loss": 0.0013, + "num_tokens": 101631238.0, + "reward": 4.348268508911133, + "reward_std": 0.469217449426651, + "rewards/accuracy_reward/mean": 3.598268747329712, + "rewards/accuracy_reward/std": 3.741929769515991, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 603 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 917.0, + "completions/max_terminated_length": 917.0, + "completions/mean_length": 613.953125, + "completions/mean_terminated_length": 613.953125, + "completions/min_length": 377.0, + "completions/min_terminated_length": 377.0, + "epoch": 0.3649546827794562, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.062429413199424744, + "learning_rate": 2.334324246398055e-06, + "loss": 0.0025, + "num_tokens": 101803267.0, + "reward": 5.209334373474121, + "reward_std": 3.644486904144287, + "rewards/accuracy_reward/mean": 4.459334373474121, + "rewards/accuracy_reward/std": 3.7738029956817627, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 604 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1236.0, + "completions/max_terminated_length": 1236.0, + "completions/mean_length": 554.25, + "completions/mean_terminated_length": 554.25, + "completions/min_length": 366.0, + "completions/min_terminated_length": 366.0, + "epoch": 0.36555891238670696, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03263628110289574, + "learning_rate": 2.331997261209444e-06, + "loss": -0.0128, + "num_tokens": 101978099.0, + "reward": 4.321915626525879, + "reward_std": 1.3853849172592163, + "rewards/accuracy_reward/mean": 3.571915626525879, + "rewards/accuracy_reward/std": 3.7150309085845947, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 605 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 744.0, + "completions/max_terminated_length": 744.0, + "completions/mean_length": 489.765625, + "completions/mean_terminated_length": 489.765625, + "completions/min_length": 309.0, + "completions/min_terminated_length": 309.0, + "epoch": 0.3661631419939577, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03521180897951126, + "learning_rate": 2.3296675522087122e-06, + "loss": 0.0014, + "num_tokens": 102117156.0, + "reward": 3.76415753364563, + "reward_std": 1.4089882373809814, + "rewards/accuracy_reward/mean": 3.01415753364563, + "rewards/accuracy_reward/std": 3.672886848449707, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 606 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1054.0, + "completions/mean_length": 606.453125, + "completions/mean_terminated_length": 583.5714721679688, + "completions/min_length": 277.0, + "completions/min_terminated_length": 277.0, + "epoch": 0.3667673716012085, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04904481768608093, + "learning_rate": 2.3273351287004286e-06, + "loss": -0.0105, + "num_tokens": 102298705.0, + "reward": 4.168015480041504, + "reward_std": 2.4391446113586426, + "rewards/accuracy_reward/mean": 3.429734468460083, + "rewards/accuracy_reward/std": 3.7142698764801025, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 607 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 804.0, + "completions/max_terminated_length": 804.0, + "completions/mean_length": 603.359375, + "completions/mean_terminated_length": 603.359375, + "completions/min_length": 383.0, + "completions/min_terminated_length": 383.0, + "epoch": 0.36737160120845924, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.03290191665291786, + "learning_rate": 2.325e-06, + "loss": -0.0156, + "num_tokens": 102460824.0, + "reward": 1.573218822479248, + "reward_std": 1.5254120826721191, + "rewards/accuracy_reward/mean": 0.823218822479248, + "rewards/accuracy_reward/std": 2.3452706336975098, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 608 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1057.0, + "completions/max_terminated_length": 1057.0, + "completions/mean_length": 509.78125, + "completions/mean_terminated_length": 509.78125, + "completions/min_length": 258.0, + "completions/min_terminated_length": 258.0, + "epoch": 0.36797583081570995, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.0022435912396758795, + "learning_rate": 2.322662175433642e-06, + "loss": -0.001, + "num_tokens": 102616474.0, + "reward": 0.734375, + "reward_std": 0.0625, + "rewards/accuracy_reward/mean": -0.015625, + "rewards/accuracy_reward/std": 0.125, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 609 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 789.0, + "completions/max_terminated_length": 789.0, + "completions/mean_length": 525.9375, + "completions/mean_terminated_length": 525.9375, + "completions/min_length": 351.0, + "completions/min_terminated_length": 351.0, + "epoch": 0.3685800604229607, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.033436402678489685, + "learning_rate": 2.320321664338333e-06, + "loss": 0.0049, + "num_tokens": 102799158.0, + "reward": 7.501306056976318, + "reward_std": 1.3641502857208252, + "rewards/accuracy_reward/mean": 6.751306533813477, + "rewards/accuracy_reward/std": 2.1627097129821777, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 610 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 767.0, + "completions/max_terminated_length": 767.0, + "completions/mean_length": 496.84375, + "completions/mean_terminated_length": 496.84375, + "completions/min_length": 338.0, + "completions/min_terminated_length": 338.0, + "epoch": 0.36918429003021147, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0257108137011528, + "learning_rate": 2.3179784760617838e-06, + "loss": 0.0002, + "num_tokens": 102963196.0, + "reward": 3.7248687744140625, + "reward_std": 1.0003536939620972, + "rewards/accuracy_reward/mean": 2.9748687744140625, + "rewards/accuracy_reward/std": 3.6593141555786133, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 611 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 919.0, + "completions/max_terminated_length": 919.0, + "completions/mean_length": 541.890625, + "completions/mean_terminated_length": 541.890625, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "epoch": 0.36978851963746223, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.024479081854224205, + "learning_rate": 2.3156326199623965e-06, + "loss": -0.0016, + "num_tokens": 103174437.0, + "reward": 2.9342639446258545, + "reward_std": 1.1319355964660645, + "rewards/accuracy_reward/mean": 2.1842639446258545, + "rewards/accuracy_reward/std": 3.4252278804779053, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 612 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1037.0, + "completions/max_terminated_length": 1037.0, + "completions/mean_length": 657.0625, + "completions/mean_terminated_length": 657.0625, + "completions/min_length": 386.0, + "completions/min_terminated_length": 386.0, + "epoch": 0.370392749244713, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.027440987527370453, + "learning_rate": 2.3132841054092277e-06, + "loss": -0.008, + "num_tokens": 103360025.0, + "reward": 4.71362829208374, + "reward_std": 1.2076504230499268, + "rewards/accuracy_reward/mean": 3.963627815246582, + "rewards/accuracy_reward/std": 3.700237512588501, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 613 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 916.0, + "completions/mean_length": 547.6875, + "completions/mean_terminated_length": 523.873046875, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 0.37099697885196375, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.045284442603588104, + "learning_rate": 2.310932941781952e-06, + "loss": -0.0471, + "num_tokens": 103594885.0, + "reward": 4.831441879272461, + "reward_std": 2.6380953788757324, + "rewards/accuracy_reward/mean": 4.093160629272461, + "rewards/accuracy_reward/std": 3.918578863143921, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 614 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 988.0, + "completions/max_terminated_length": 988.0, + "completions/mean_length": 533.359375, + "completions/mean_terminated_length": 533.359375, + "completions/min_length": 344.0, + "completions/min_terminated_length": 344.0, + "epoch": 0.3716012084592145, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.024308834224939346, + "learning_rate": 2.308579138470825e-06, + "loss": -0.0014, + "num_tokens": 103831212.0, + "reward": 5.862656593322754, + "reward_std": 1.2395079135894775, + "rewards/accuracy_reward/mean": 5.112656593322754, + "rewards/accuracy_reward/std": 3.4497828483581543, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 615 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1029.0, + "completions/max_terminated_length": 1029.0, + "completions/mean_length": 464.625, + "completions/mean_terminated_length": 464.625, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.3722054380664653, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03128449246287346, + "learning_rate": 2.3062227048766425e-06, + "loss": 0.0115, + "num_tokens": 104024948.0, + "reward": 6.849605083465576, + "reward_std": 0.9772966504096985, + "rewards/accuracy_reward/mean": 6.099605083465576, + "rewards/accuracy_reward/std": 2.8523411750793457, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 616 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 906.0, + "completions/max_terminated_length": 906.0, + "completions/mean_length": 586.421875, + "completions/mean_terminated_length": 586.421875, + "completions/min_length": 301.0, + "completions/min_terminated_length": 301.0, + "epoch": 0.37280966767371604, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03493410348892212, + "learning_rate": 2.3038636504107086e-06, + "loss": -0.0133, + "num_tokens": 104195519.0, + "reward": 3.717839002609253, + "reward_std": 1.4956833124160767, + "rewards/accuracy_reward/mean": 2.967839002609253, + "rewards/accuracy_reward/std": 3.626225471496582, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 617 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1150.0, + "completions/max_terminated_length": 1150.0, + "completions/mean_length": 646.0, + "completions/mean_terminated_length": 646.0, + "completions/min_length": 325.0, + "completions/min_terminated_length": 325.0, + "epoch": 0.37341389728096674, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.036380965262651443, + "learning_rate": 2.3015019844947912e-06, + "loss": 0.002, + "num_tokens": 104435583.0, + "reward": 4.970524787902832, + "reward_std": 1.862847089767456, + "rewards/accuracy_reward/mean": 4.22052526473999, + "rewards/accuracy_reward/std": 3.667523145675659, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 618 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 757.0, + "completions/max_terminated_length": 757.0, + "completions/mean_length": 512.203125, + "completions/mean_terminated_length": 512.203125, + "completions/min_length": 355.0, + "completions/min_terminated_length": 355.0, + "epoch": 0.3740181268882175, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06358713656663895, + "learning_rate": 2.2991377165610905e-06, + "loss": 0.0142, + "num_tokens": 104594604.0, + "reward": 4.913411617279053, + "reward_std": 3.638385534286499, + "rewards/accuracy_reward/mean": 4.163411617279053, + "rewards/accuracy_reward/std": 3.7564072608947754, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 619 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1047.0, + "completions/max_terminated_length": 1047.0, + "completions/mean_length": 577.328125, + "completions/mean_terminated_length": 577.328125, + "completions/min_length": 371.0, + "completions/min_terminated_length": 371.0, + "epoch": 0.37462235649546827, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.041663553565740585, + "learning_rate": 2.2967708560521996e-06, + "loss": -0.0058, + "num_tokens": 104743665.0, + "reward": 6.350361347198486, + "reward_std": 2.3647027015686035, + "rewards/accuracy_reward/mean": 5.600361347198486, + "rewards/accuracy_reward/std": 3.2414841651916504, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 620 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 944.0, + "completions/max_terminated_length": 944.0, + "completions/mean_length": 536.71875, + "completions/mean_terminated_length": 536.71875, + "completions/min_length": 364.0, + "completions/min_terminated_length": 364.0, + "epoch": 0.37522658610271903, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0581195242702961, + "learning_rate": 2.2944014124210622e-06, + "loss": -0.0129, + "num_tokens": 104922911.0, + "reward": 5.324117660522461, + "reward_std": 3.207876205444336, + "rewards/accuracy_reward/mean": 4.581930160522461, + "rewards/accuracy_reward/std": 3.7145276069641113, + "rewards/tag_count_reward/mean": 0.7421875, + "rewards/tag_count_reward/std": 0.043842025101184845, + "step": 621 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1713.0, + "completions/max_terminated_length": 1713.0, + "completions/mean_length": 595.90625, + "completions/mean_terminated_length": 595.90625, + "completions/min_length": 332.0, + "completions/min_terminated_length": 332.0, + "epoch": 0.3758308157099698, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04578837752342224, + "learning_rate": 2.2920293951309427e-06, + "loss": -0.0309, + "num_tokens": 105080681.0, + "reward": 5.379464149475098, + "reward_std": 1.0047770738601685, + "rewards/accuracy_reward/mean": 4.629464149475098, + "rewards/accuracy_reward/std": 3.6015472412109375, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 622 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 911.0, + "completions/mean_length": 531.828125, + "completions/mean_terminated_length": 507.7619323730469, + "completions/min_length": 287.0, + "completions/min_terminated_length": 287.0, + "epoch": 0.37643504531722055, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.02498473785817623, + "learning_rate": 2.2896548136553817e-06, + "loss": -0.0084, + "num_tokens": 105202990.0, + "reward": 5.986804962158203, + "reward_std": 0.8710455894470215, + "rewards/accuracy_reward/mean": 5.260241985321045, + "rewards/accuracy_reward/std": 3.452625036239624, + "rewards/tag_count_reward/mean": 0.7265625, + "rewards/tag_count_reward/std": 0.13152606785297394, + "step": 623 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 866.0, + "completions/max_terminated_length": 866.0, + "completions/mean_length": 618.078125, + "completions/mean_terminated_length": 618.078125, + "completions/min_length": 398.0, + "completions/min_terminated_length": 398.0, + "epoch": 0.3770392749244713, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.049497514963150024, + "learning_rate": 2.2872776774781627e-06, + "loss": -0.0114, + "num_tokens": 105379251.0, + "reward": 2.46805477142334, + "reward_std": 2.4250648021698, + "rewards/accuracy_reward/mean": 1.7180546522140503, + "rewards/accuracy_reward/std": 3.3530941009521484, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 624 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1179.0, + "completions/max_terminated_length": 1179.0, + "completions/mean_length": 603.21875, + "completions/mean_terminated_length": 603.21875, + "completions/min_length": 335.0, + "completions/min_terminated_length": 335.0, + "epoch": 0.3776435045317221, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.05505942180752754, + "learning_rate": 2.284897996093271e-06, + "loss": -0.0091, + "num_tokens": 105532001.0, + "reward": 4.452691078186035, + "reward_std": 2.5179545879364014, + "rewards/accuracy_reward/mean": 3.702690601348877, + "rewards/accuracy_reward/std": 3.785402536392212, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 625 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 957.0, + "completions/max_terminated_length": 957.0, + "completions/mean_length": 604.84375, + "completions/mean_terminated_length": 604.84375, + "completions/min_length": 408.0, + "completions/min_terminated_length": 408.0, + "epoch": 0.37824773413897284, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05843400955200195, + "learning_rate": 2.282515779004858e-06, + "loss": -0.0058, + "num_tokens": 105680279.0, + "reward": 4.354640960693359, + "reward_std": 3.245469570159912, + "rewards/accuracy_reward/mean": 3.604640483856201, + "rewards/accuracy_reward/std": 3.7485921382904053, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 626 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 882.0, + "completions/max_terminated_length": 882.0, + "completions/mean_length": 589.953125, + "completions/mean_terminated_length": 589.953125, + "completions/min_length": 333.0, + "completions/min_terminated_length": 333.0, + "epoch": 0.37885196374622354, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04277821257710457, + "learning_rate": 2.280131035727202e-06, + "loss": 0.0089, + "num_tokens": 105920196.0, + "reward": 3.402340888977051, + "reward_std": 1.4709904193878174, + "rewards/accuracy_reward/mean": 2.6523404121398926, + "rewards/accuracy_reward/std": 3.5464556217193604, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 627 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1380.0, + "completions/max_terminated_length": 1380.0, + "completions/mean_length": 645.984375, + "completions/mean_terminated_length": 645.984375, + "completions/min_length": 301.0, + "completions/min_terminated_length": 301.0, + "epoch": 0.3794561933534743, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.040962450206279755, + "learning_rate": 2.27774377578467e-06, + "loss": -0.0145, + "num_tokens": 106076883.0, + "reward": 4.417929649353027, + "reward_std": 1.576377511024475, + "rewards/accuracy_reward/mean": 3.6718358993530273, + "rewards/accuracy_reward/std": 3.9159021377563477, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 628 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 923.0, + "completions/max_terminated_length": 923.0, + "completions/mean_length": 550.046875, + "completions/mean_terminated_length": 550.046875, + "completions/min_length": 294.0, + "completions/min_terminated_length": 294.0, + "epoch": 0.38006042296072506, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.032439518719911575, + "learning_rate": 2.275354008711682e-06, + "loss": -0.015, + "num_tokens": 106226550.0, + "reward": 5.493226051330566, + "reward_std": 1.9037315845489502, + "rewards/accuracy_reward/mean": 4.743226051330566, + "rewards/accuracy_reward/std": 3.5267536640167236, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 629 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 703.0, + "completions/mean_length": 510.25, + "completions/mean_terminated_length": 485.84130859375, + "completions/min_length": 274.0, + "completions/min_terminated_length": 274.0, + "epoch": 0.3806646525679758, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04109392315149307, + "learning_rate": 2.272961744052669e-06, + "loss": -0.0249, + "num_tokens": 106397238.0, + "reward": 1.4059468507766724, + "reward_std": 1.586371898651123, + "rewards/accuracy_reward/mean": 0.6676656007766724, + "rewards/accuracy_reward/std": 2.207293748855591, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 630 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 840.0, + "completions/max_terminated_length": 840.0, + "completions/mean_length": 593.328125, + "completions/mean_terminated_length": 593.328125, + "completions/min_length": 433.0, + "completions/min_terminated_length": 433.0, + "epoch": 0.3812688821752266, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.024659698829054832, + "learning_rate": 2.270566991362039e-06, + "loss": -0.0006, + "num_tokens": 106526411.0, + "reward": 6.059937477111816, + "reward_std": 1.0005804300308228, + "rewards/accuracy_reward/mean": 5.313843727111816, + "rewards/accuracy_reward/std": 3.3883180618286133, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 631 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 882.0, + "completions/max_terminated_length": 882.0, + "completions/mean_length": 541.578125, + "completions/mean_terminated_length": 541.578125, + "completions/min_length": 384.0, + "completions/min_terminated_length": 384.0, + "epoch": 0.38187311178247735, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.028686419129371643, + "learning_rate": 2.2681697602041355e-06, + "loss": 0.0209, + "num_tokens": 106708944.0, + "reward": 6.191097259521484, + "reward_std": 1.206233024597168, + "rewards/accuracy_reward/mean": 5.441097259521484, + "rewards/accuracy_reward/std": 3.2222394943237305, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 632 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 932.0, + "completions/mean_length": 560.828125, + "completions/mean_terminated_length": 512.8547973632812, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "epoch": 0.3824773413897281, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04997645691037178, + "learning_rate": 2.265770060153201e-06, + "loss": -0.0773, + "num_tokens": 106892517.0, + "reward": 5.092750072479248, + "reward_std": 2.3058688640594482, + "rewards/accuracy_reward/mean": 4.36618709564209, + "rewards/accuracy_reward/std": 3.793440818786621, + "rewards/tag_count_reward/mean": 0.7265625, + "rewards/tag_count_reward/std": 0.13152606785297394, + "step": 633 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 991.0, + "completions/max_terminated_length": 991.0, + "completions/mean_length": 568.78125, + "completions/mean_terminated_length": 568.78125, + "completions/min_length": 384.0, + "completions/min_terminated_length": 384.0, + "epoch": 0.38308157099697887, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.04408370703458786, + "learning_rate": 2.263367900793339e-06, + "loss": -0.0025, + "num_tokens": 107103671.0, + "reward": 3.7486109733581543, + "reward_std": 0.9350475072860718, + "rewards/accuracy_reward/mean": 2.9986109733581543, + "rewards/accuracy_reward/std": 3.653909921646118, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 634 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1104.0, + "completions/max_terminated_length": 1104.0, + "completions/mean_length": 581.796875, + "completions/mean_terminated_length": 581.796875, + "completions/min_length": 276.0, + "completions/min_terminated_length": 276.0, + "epoch": 0.38368580060422963, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.054460953921079636, + "learning_rate": 2.260963291718475e-06, + "loss": 0.0218, + "num_tokens": 107259130.0, + "reward": 5.510498046875, + "reward_std": 2.396334171295166, + "rewards/accuracy_reward/mean": 4.760498046875, + "rewards/accuracy_reward/std": 3.6815202236175537, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 635 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 719.0, + "completions/max_terminated_length": 719.0, + "completions/mean_length": 483.796875, + "completions/mean_terminated_length": 483.796875, + "completions/min_length": 280.0, + "completions/min_terminated_length": 280.0, + "epoch": 0.38429003021148034, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.021570546552538872, + "learning_rate": 2.258556242532317e-06, + "loss": -0.0135, + "num_tokens": 107395997.0, + "reward": 7.818881034851074, + "reward_std": 0.7703056931495667, + "rewards/accuracy_reward/mean": 7.068881034851074, + "rewards/accuracy_reward/std": 1.5810744762420654, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 636 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1278.0, + "completions/max_terminated_length": 1278.0, + "completions/mean_length": 696.046875, + "completions/mean_terminated_length": 696.046875, + "completions/min_length": 408.0, + "completions/min_terminated_length": 408.0, + "epoch": 0.3848942598187311, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.05401880666613579, + "learning_rate": 2.256146762848321e-06, + "loss": -0.004, + "num_tokens": 107559168.0, + "reward": 4.910065650939941, + "reward_std": 1.895164132118225, + "rewards/accuracy_reward/mean": 4.160065650939941, + "rewards/accuracy_reward/std": 3.77001690864563, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 637 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1101.0, + "completions/max_terminated_length": 1101.0, + "completions/mean_length": 683.703125, + "completions/mean_terminated_length": 683.703125, + "completions/min_length": 334.0, + "completions/min_terminated_length": 334.0, + "epoch": 0.38549848942598186, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04129048064351082, + "learning_rate": 2.253734862289648e-06, + "loss": -0.0155, + "num_tokens": 107670301.0, + "reward": 1.8430062532424927, + "reward_std": 1.062828540802002, + "rewards/accuracy_reward/mean": 1.0969123840332031, + "rewards/accuracy_reward/std": 2.7075860500335693, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 638 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1391.0, + "completions/max_terminated_length": 1391.0, + "completions/mean_length": 628.328125, + "completions/mean_terminated_length": 628.328125, + "completions/min_length": 342.0, + "completions/min_terminated_length": 342.0, + "epoch": 0.3861027190332326, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03479877486824989, + "learning_rate": 2.251320550489129e-06, + "loss": 0.0258, + "num_tokens": 107875986.0, + "reward": 4.313976287841797, + "reward_std": 1.4814016819000244, + "rewards/accuracy_reward/mean": 3.567882537841797, + "rewards/accuracy_reward/std": 3.7773375511169434, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 639 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 843.0, + "completions/max_terminated_length": 843.0, + "completions/mean_length": 557.8125, + "completions/mean_terminated_length": 557.8125, + "completions/min_length": 308.0, + "completions/min_terminated_length": 308.0, + "epoch": 0.3867069486404834, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0350569523870945, + "learning_rate": 2.2489038370892244e-06, + "loss": -0.016, + "num_tokens": 108036214.0, + "reward": 4.094048500061035, + "reward_std": 1.9550466537475586, + "rewards/accuracy_reward/mean": 3.344048500061035, + "rewards/accuracy_reward/std": 3.736623525619507, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 640 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 981.0, + "completions/max_terminated_length": 981.0, + "completions/mean_length": 565.953125, + "completions/mean_terminated_length": 565.953125, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "epoch": 0.38731117824773414, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04504424333572388, + "learning_rate": 2.246484731741986e-06, + "loss": -0.001, + "num_tokens": 108214611.0, + "reward": 6.532639026641846, + "reward_std": 1.9630457162857056, + "rewards/accuracy_reward/mean": 5.794357776641846, + "rewards/accuracy_reward/std": 3.170980453491211, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 641 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1156.0, + "completions/mean_length": 585.15625, + "completions/mean_terminated_length": 561.9365234375, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 0.3879154078549849, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.033980149775743484, + "learning_rate": 2.24406324410902e-06, + "loss": -0.0071, + "num_tokens": 108436829.0, + "reward": 3.865971326828003, + "reward_std": 1.6080200672149658, + "rewards/accuracy_reward/mean": 3.139409065246582, + "rewards/accuracy_reward/std": 3.5967726707458496, + "rewards/tag_count_reward/mean": 0.7265625, + "rewards/tag_count_reward/std": 0.13152606785297394, + "step": 642 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 752.0, + "completions/mean_length": 561.640625, + "completions/mean_terminated_length": 538.0476684570312, + "completions/min_length": 291.0, + "completions/min_terminated_length": 291.0, + "epoch": 0.38851963746223567, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.05007398873567581, + "learning_rate": 2.2416393838614457e-06, + "loss": -0.0501, + "num_tokens": 108584566.0, + "reward": 3.3711376190185547, + "reward_std": 2.6669814586639404, + "rewards/accuracy_reward/mean": 2.6328563690185547, + "rewards/accuracy_reward/std": 3.6476082801818848, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 643 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 743.0, + "completions/max_terminated_length": 743.0, + "completions/mean_length": 554.9375, + "completions/mean_terminated_length": 554.9375, + "completions/min_length": 352.0, + "completions/min_terminated_length": 352.0, + "epoch": 0.38912386706948643, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0463605597615242, + "learning_rate": 2.23921316067986e-06, + "loss": 0.0162, + "num_tokens": 108786610.0, + "reward": 6.26987361907959, + "reward_std": 2.3371100425720215, + "rewards/accuracy_reward/mean": 5.51987361907959, + "rewards/accuracy_reward/std": 3.289806842803955, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 644 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 760.0, + "completions/mean_length": 565.515625, + "completions/mean_terminated_length": 541.984130859375, + "completions/min_length": 311.0, + "completions/min_terminated_length": 311.0, + "epoch": 0.38972809667673713, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.05093276873230934, + "learning_rate": 2.2367845842542953e-06, + "loss": -0.0318, + "num_tokens": 108925043.0, + "reward": 2.8208749294281006, + "reward_std": 2.0061538219451904, + "rewards/accuracy_reward/mean": 2.0864999294281006, + "rewards/accuracy_reward/std": 3.491361618041992, + "rewards/tag_count_reward/mean": 0.734375, + "rewards/tag_count_reward/std": 0.09834947437047958, + "step": 645 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 731.0, + "completions/max_terminated_length": 731.0, + "completions/mean_length": 451.53125, + "completions/mean_terminated_length": 451.53125, + "completions/min_length": 294.0, + "completions/min_terminated_length": 294.0, + "epoch": 0.3903323262839879, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.024829436093568802, + "learning_rate": 2.234353664284183e-06, + "loss": 0.0084, + "num_tokens": 109113109.0, + "reward": 6.090208530426025, + "reward_std": 0.6502478122711182, + "rewards/accuracy_reward/mean": 5.340208530426025, + "rewards/accuracy_reward/std": 3.3670778274536133, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 646 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 761.0, + "completions/max_terminated_length": 761.0, + "completions/mean_length": 450.296875, + "completions/mean_terminated_length": 450.296875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "epoch": 0.39093655589123866, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04200921580195427, + "learning_rate": 2.231920410478316e-06, + "loss": -0.0057, + "num_tokens": 109250248.0, + "reward": 7.201107025146484, + "reward_std": 0.9967567920684814, + "rewards/accuracy_reward/mean": 6.451107025146484, + "rewards/accuracy_reward/std": 2.508700132369995, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 647 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1412.0, + "completions/max_terminated_length": 1412.0, + "completions/mean_length": 614.296875, + "completions/mean_terminated_length": 614.296875, + "completions/min_length": 313.0, + "completions/min_terminated_length": 313.0, + "epoch": 0.3915407854984894, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.030943112447857857, + "learning_rate": 2.2294848325548066e-06, + "loss": -0.0256, + "num_tokens": 109398027.0, + "reward": 5.322218894958496, + "reward_std": 1.4429855346679688, + "rewards/accuracy_reward/mean": 4.572218894958496, + "rewards/accuracy_reward/std": 3.4907028675079346, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 648 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 805.0, + "completions/max_terminated_length": 805.0, + "completions/mean_length": 568.25, + "completions/mean_terminated_length": 568.25, + "completions/min_length": 383.0, + "completions/min_terminated_length": 383.0, + "epoch": 0.3921450151057402, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.02914736419916153, + "learning_rate": 2.227046940241049e-06, + "loss": 0.0298, + "num_tokens": 109536923.0, + "reward": 3.9511733055114746, + "reward_std": 0.956314206123352, + "rewards/accuracy_reward/mean": 3.2011733055114746, + "rewards/accuracy_reward/std": 3.7596166133880615, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 649 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1262.0, + "completions/max_terminated_length": 1262.0, + "completions/mean_length": 624.28125, + "completions/mean_terminated_length": 624.28125, + "completions/min_length": 388.0, + "completions/min_terminated_length": 388.0, + "epoch": 0.39274924471299094, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0019236913649365306, + "learning_rate": 2.2246067432736813e-06, + "loss": 0.0016, + "num_tokens": 109706013.0, + "reward": 2.6549015045166016, + "reward_std": 0.06076320633292198, + "rewards/accuracy_reward/mean": 1.9049016237258911, + "rewards/accuracy_reward/std": 3.244358777999878, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 650 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 887.0, + "completions/max_terminated_length": 887.0, + "completions/mean_length": 530.53125, + "completions/mean_terminated_length": 530.53125, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "epoch": 0.3933534743202417, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05642035976052284, + "learning_rate": 2.2221642513985473e-06, + "loss": 0.0244, + "num_tokens": 109939407.0, + "reward": 6.604456901550293, + "reward_std": 2.5603504180908203, + "rewards/accuracy_reward/mean": 5.854456901550293, + "rewards/accuracy_reward/std": 3.022066354751587, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 651 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1027.0, + "completions/max_terminated_length": 1027.0, + "completions/mean_length": 587.328125, + "completions/mean_terminated_length": 587.328125, + "completions/min_length": 383.0, + "completions/min_terminated_length": 383.0, + "epoch": 0.39395770392749246, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.047218091785907745, + "learning_rate": 2.219719474370655e-06, + "loss": 0.0269, + "num_tokens": 110098132.0, + "reward": 6.9234795570373535, + "reward_std": 2.3610458374023438, + "rewards/accuracy_reward/mean": 6.173480033874512, + "rewards/accuracy_reward/std": 2.8348498344421387, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 652 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 945.0, + "completions/max_terminated_length": 945.0, + "completions/mean_length": 539.09375, + "completions/mean_terminated_length": 539.09375, + "completions/min_length": 349.0, + "completions/min_terminated_length": 349.0, + "epoch": 0.3945619335347432, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.030648574233055115, + "learning_rate": 2.217272421954139e-06, + "loss": 0.012, + "num_tokens": 110239706.0, + "reward": 2.5934438705444336, + "reward_std": 1.3787918090820312, + "rewards/accuracy_reward/mean": 1.843443751335144, + "rewards/accuracy_reward/std": 3.279514789581299, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 653 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 763.0, + "completions/max_terminated_length": 763.0, + "completions/mean_length": 471.46875, + "completions/mean_terminated_length": 471.46875, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.39516616314199393, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.000725586898624897, + "learning_rate": 2.2148231039222224e-06, + "loss": 0.0, + "num_tokens": 110362408.0, + "reward": 2.5743627548217773, + "reward_std": 0.021730881184339523, + "rewards/accuracy_reward/mean": 1.8243625164031982, + "rewards/accuracy_reward/std": 3.1851508617401123, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 654 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 723.0, + "completions/max_terminated_length": 723.0, + "completions/mean_length": 430.34375, + "completions/mean_terminated_length": 430.34375, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "epoch": 0.3957703927492447, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.033305078744888306, + "learning_rate": 2.212371530057175e-06, + "loss": -0.0035, + "num_tokens": 110485934.0, + "reward": 5.794437408447266, + "reward_std": 0.8575419187545776, + "rewards/accuracy_reward/mean": 5.044437408447266, + "rewards/accuracy_reward/std": 3.464524984359741, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 655 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 880.0, + "completions/max_terminated_length": 880.0, + "completions/mean_length": 518.453125, + "completions/mean_terminated_length": 518.453125, + "completions/min_length": 331.0, + "completions/min_terminated_length": 331.0, + "epoch": 0.39637462235649545, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.033474475145339966, + "learning_rate": 2.2099177101502796e-06, + "loss": -0.0024, + "num_tokens": 110640379.0, + "reward": 5.94204044342041, + "reward_std": 1.3198528289794922, + "rewards/accuracy_reward/mean": 5.192041397094727, + "rewards/accuracy_reward/std": 3.401319980621338, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 656 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1138.0, + "completions/max_terminated_length": 1138.0, + "completions/mean_length": 608.046875, + "completions/mean_terminated_length": 608.046875, + "completions/min_length": 309.0, + "completions/min_terminated_length": 309.0, + "epoch": 0.3969788519637462, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03556086868047714, + "learning_rate": 2.207461654001786e-06, + "loss": 0.0017, + "num_tokens": 110825342.0, + "reward": 1.5946593284606934, + "reward_std": 1.0898072719573975, + "rewards/accuracy_reward/mean": 0.8446593284606934, + "rewards/accuracy_reward/std": 2.151150703430176, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 657 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 885.0, + "completions/mean_length": 645.28125, + "completions/mean_terminated_length": 600.0322265625, + "completions/min_length": 412.0, + "completions/min_terminated_length": 412.0, + "epoch": 0.397583081570997, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04328981786966324, + "learning_rate": 2.205003371420876e-06, + "loss": -0.0199, + "num_tokens": 110983280.0, + "reward": 2.811678171157837, + "reward_std": 1.9342856407165527, + "rewards/accuracy_reward/mean": 2.085115671157837, + "rewards/accuracy_reward/std": 3.379192352294922, + "rewards/tag_count_reward/mean": 0.7265625, + "rewards/tag_count_reward/std": 0.13152606785297394, + "step": 658 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1008.0, + "completions/max_terminated_length": 1008.0, + "completions/mean_length": 626.921875, + "completions/mean_terminated_length": 626.921875, + "completions/min_length": 433.0, + "completions/min_terminated_length": 433.0, + "epoch": 0.39818731117824774, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.020107567310333252, + "learning_rate": 2.202542872225626e-06, + "loss": -0.0052, + "num_tokens": 111111003.0, + "reward": 4.800782680511475, + "reward_std": 0.6178919076919556, + "rewards/accuracy_reward/mean": 4.050783157348633, + "rewards/accuracy_reward/std": 3.6567981243133545, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 659 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1324.0, + "completions/max_terminated_length": 1324.0, + "completions/mean_length": 616.609375, + "completions/mean_terminated_length": 616.609375, + "completions/min_length": 443.0, + "completions/min_terminated_length": 443.0, + "epoch": 0.3987915407854985, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.05214482173323631, + "learning_rate": 2.200080166242961e-06, + "loss": 0.0052, + "num_tokens": 111244626.0, + "reward": 4.537371635437012, + "reward_std": 1.9454087018966675, + "rewards/accuracy_reward/mean": 3.787371873855591, + "rewards/accuracy_reward/std": 3.800194025039673, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 660 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1124.0, + "completions/max_terminated_length": 1124.0, + "completions/mean_length": 518.65625, + "completions/mean_terminated_length": 518.65625, + "completions/min_length": 353.0, + "completions/min_terminated_length": 353.0, + "epoch": 0.39939577039274926, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.01699947379529476, + "learning_rate": 2.197615263308624e-06, + "loss": 0.0115, + "num_tokens": 111362156.0, + "reward": 4.320437431335449, + "reward_std": 0.47070014476776123, + "rewards/accuracy_reward/mean": 3.570437431335449, + "rewards/accuracy_reward/std": 3.7133402824401855, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 661 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 898.0, + "completions/max_terminated_length": 898.0, + "completions/mean_length": 594.109375, + "completions/mean_terminated_length": 594.109375, + "completions/min_length": 389.0, + "completions/min_terminated_length": 389.0, + "epoch": 0.4, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04728113114833832, + "learning_rate": 2.1951481732671293e-06, + "loss": 0.0074, + "num_tokens": 111507187.0, + "reward": 6.811520099639893, + "reward_std": 1.8444682359695435, + "rewards/accuracy_reward/mean": 6.061520099639893, + "rewards/accuracy_reward/std": 2.9165842533111572, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 662 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 937.0, + "completions/max_terminated_length": 937.0, + "completions/mean_length": 609.078125, + "completions/mean_terminated_length": 609.078125, + "completions/min_length": 339.0, + "completions/min_terminated_length": 339.0, + "epoch": 0.40060422960725073, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03539511188864708, + "learning_rate": 2.192678905971727e-06, + "loss": 0.0028, + "num_tokens": 111666984.0, + "reward": 4.871679306030273, + "reward_std": 1.886969804763794, + "rewards/accuracy_reward/mean": 4.121679306030273, + "rewards/accuracy_reward/std": 3.6822335720062256, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 663 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 616.0, + "completions/max_terminated_length": 616.0, + "completions/mean_length": 421.03125, + "completions/mean_terminated_length": 421.03125, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 0.4012084592145015, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.041824210435152054, + "learning_rate": 2.1902074712843637e-06, + "loss": 0.002, + "num_tokens": 111866906.0, + "reward": 3.7571206092834473, + "reward_std": 2.3339433670043945, + "rewards/accuracy_reward/mean": 3.007120132446289, + "rewards/accuracy_reward/std": 3.664247989654541, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 664 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1257.0, + "completions/max_terminated_length": 1257.0, + "completions/mean_length": 657.703125, + "completions/mean_terminated_length": 657.703125, + "completions/min_length": 373.0, + "completions/min_terminated_length": 373.0, + "epoch": 0.40181268882175225, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.040892865508794785, + "learning_rate": 2.1877338790756413e-06, + "loss": 0.0142, + "num_tokens": 112018151.0, + "reward": 5.465047359466553, + "reward_std": 1.9629805088043213, + "rewards/accuracy_reward/mean": 4.715047836303711, + "rewards/accuracy_reward/std": 3.5810062885284424, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 665 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1296.0, + "completions/max_terminated_length": 1296.0, + "completions/mean_length": 647.890625, + "completions/mean_terminated_length": 647.890625, + "completions/min_length": 399.0, + "completions/min_terminated_length": 399.0, + "epoch": 0.402416918429003, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06024125590920448, + "learning_rate": 2.1852581392247796e-06, + "loss": 0.01, + "num_tokens": 112192336.0, + "reward": 3.2866733074188232, + "reward_std": 2.8192572593688965, + "rewards/accuracy_reward/mean": 2.5366733074188232, + "rewards/accuracy_reward/std": 3.603172540664673, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 666 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 836.0, + "completions/max_terminated_length": 836.0, + "completions/mean_length": 578.234375, + "completions/mean_terminated_length": 578.234375, + "completions/min_length": 336.0, + "completions/min_terminated_length": 336.0, + "epoch": 0.4030211480362538, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.048575013875961304, + "learning_rate": 2.1827802616195753e-06, + "loss": -0.001, + "num_tokens": 112338399.0, + "reward": 5.455204963684082, + "reward_std": 2.767855644226074, + "rewards/accuracy_reward/mean": 4.705204963684082, + "rewards/accuracy_reward/std": 3.6907503604888916, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 667 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 962.0, + "completions/max_terminated_length": 962.0, + "completions/mean_length": 523.75, + "completions/mean_terminated_length": 523.75, + "completions/min_length": 280.0, + "completions/min_terminated_length": 280.0, + "epoch": 0.40362537764350453, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04487089067697525, + "learning_rate": 2.180300256156362e-06, + "loss": 0.0268, + "num_tokens": 112472959.0, + "reward": 2.7425765991210938, + "reward_std": 1.5184659957885742, + "rewards/accuracy_reward/mean": 1.9925765991210938, + "rewards/accuracy_reward/std": 3.7159841060638428, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 668 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1727.0, + "completions/max_terminated_length": 1727.0, + "completions/mean_length": 765.75, + "completions/mean_terminated_length": 765.75, + "completions/min_length": 394.0, + "completions/min_terminated_length": 394.0, + "epoch": 0.4042296072507553, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.006174631416797638, + "learning_rate": 2.1778181327399733e-06, + "loss": -0.0014, + "num_tokens": 112721967.0, + "reward": 0.7595921754837036, + "reward_std": 0.2543012201786041, + "rewards/accuracy_reward/mean": 0.009592186659574509, + "rewards/accuracy_reward/std": 0.3120405077934265, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 669 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 789.0, + "completions/max_terminated_length": 789.0, + "completions/mean_length": 548.5625, + "completions/mean_terminated_length": 548.5625, + "completions/min_length": 362.0, + "completions/min_terminated_length": 362.0, + "epoch": 0.40483383685800606, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.05368649587035179, + "learning_rate": 2.1753339012837008e-06, + "loss": -0.0049, + "num_tokens": 112837043.0, + "reward": 4.674653053283691, + "reward_std": 2.3230960369110107, + "rewards/accuracy_reward/mean": 3.9285595417022705, + "rewards/accuracy_reward/std": 3.753181219100952, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 670 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 769.0, + "completions/max_terminated_length": 769.0, + "completions/mean_length": 467.390625, + "completions/mean_terminated_length": 467.390625, + "completions/min_length": 316.0, + "completions/min_terminated_length": 316.0, + "epoch": 0.4054380664652568, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03868391737341881, + "learning_rate": 2.172847571709256e-06, + "loss": 0.0079, + "num_tokens": 113026316.0, + "reward": 4.346445083618164, + "reward_std": 1.9648139476776123, + "rewards/accuracy_reward/mean": 3.596445083618164, + "rewards/accuracy_reward/std": 3.7415881156921387, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 671 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 847.0, + "completions/max_terminated_length": 847.0, + "completions/mean_length": 533.78125, + "completions/mean_terminated_length": 533.78125, + "completions/min_length": 354.0, + "completions/min_terminated_length": 354.0, + "epoch": 0.4060422960725076, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.02179787866771221, + "learning_rate": 2.1703591539467283e-06, + "loss": 0.0039, + "num_tokens": 113181278.0, + "reward": 6.085538864135742, + "reward_std": 0.638260006904602, + "rewards/accuracy_reward/mean": 5.335538864135742, + "rewards/accuracy_reward/std": 3.364671468734741, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 672 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 875.0, + "completions/max_terminated_length": 875.0, + "completions/mean_length": 580.015625, + "completions/mean_terminated_length": 580.015625, + "completions/min_length": 373.0, + "completions/min_terminated_length": 373.0, + "epoch": 0.4066465256797583, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.016972694545984268, + "learning_rate": 2.16786865793455e-06, + "loss": 0.005, + "num_tokens": 113367295.0, + "reward": 6.4354658126831055, + "reward_std": 0.48524153232574463, + "rewards/accuracy_reward/mean": 5.6854658126831055, + "rewards/accuracy_reward/std": 3.154125690460205, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 673 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1125.0, + "completions/max_terminated_length": 1125.0, + "completions/mean_length": 619.8125, + "completions/mean_terminated_length": 619.8125, + "completions/min_length": 382.0, + "completions/min_terminated_length": 382.0, + "epoch": 0.40725075528700905, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.023773489519953728, + "learning_rate": 2.1653760936194505e-06, + "loss": 0.003, + "num_tokens": 113522531.0, + "reward": 4.632719039916992, + "reward_std": 0.7714667320251465, + "rewards/accuracy_reward/mean": 3.882718563079834, + "rewards/accuracy_reward/std": 3.842507839202881, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 674 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 766.0, + "completions/max_terminated_length": 766.0, + "completions/mean_length": 530.078125, + "completions/mean_terminated_length": 530.078125, + "completions/min_length": 384.0, + "completions/min_terminated_length": 384.0, + "epoch": 0.4078549848942598, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05315440893173218, + "learning_rate": 2.162881470956422e-06, + "loss": -0.0012, + "num_tokens": 113680648.0, + "reward": 6.186849594116211, + "reward_std": 1.9272685050964355, + "rewards/accuracy_reward/mean": 5.436850070953369, + "rewards/accuracy_reward/std": 3.296213150024414, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 675 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 760.0, + "completions/max_terminated_length": 760.0, + "completions/mean_length": 503.09375, + "completions/mean_terminated_length": 503.09375, + "completions/min_length": 335.0, + "completions/min_terminated_length": 335.0, + "epoch": 0.40845921450151057, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05478931963443756, + "learning_rate": 2.1603847999086767e-06, + "loss": 0.0165, + "num_tokens": 113920670.0, + "reward": 5.7950592041015625, + "reward_std": 2.9437239170074463, + "rewards/accuracy_reward/mean": 5.0450592041015625, + "rewards/accuracy_reward/std": 3.4411404132843018, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 676 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1088.0, + "completions/max_terminated_length": 1088.0, + "completions/mean_length": 665.1875, + "completions/mean_terminated_length": 665.1875, + "completions/min_length": 389.0, + "completions/min_terminated_length": 389.0, + "epoch": 0.40906344410876133, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03746034577488899, + "learning_rate": 2.1578860904476076e-06, + "loss": -0.0056, + "num_tokens": 114138938.0, + "reward": 3.221266984939575, + "reward_std": 1.639195203781128, + "rewards/accuracy_reward/mean": 2.4712672233581543, + "rewards/accuracy_reward/std": 3.508272886276245, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 677 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1665.0, + "completions/max_terminated_length": 1665.0, + "completions/mean_length": 607.703125, + "completions/mean_terminated_length": 607.703125, + "completions/min_length": 298.0, + "completions/min_terminated_length": 298.0, + "epoch": 0.4096676737160121, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05031518265604973, + "learning_rate": 2.1553853525527495e-06, + "loss": -0.0088, + "num_tokens": 114264983.0, + "reward": 6.890536785125732, + "reward_std": 1.474787950515747, + "rewards/accuracy_reward/mean": 6.140537261962891, + "rewards/accuracy_reward/std": 2.873811960220337, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 678 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1023.0, + "completions/mean_length": 669.171875, + "completions/mean_terminated_length": 647.2857666015625, + "completions/min_length": 455.0, + "completions/min_terminated_length": 455.0, + "epoch": 0.41027190332326285, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0345839187502861, + "learning_rate": 2.152882596211738e-06, + "loss": -0.0157, + "num_tokens": 114387730.0, + "reward": 2.783590793609619, + "reward_std": 1.3340587615966797, + "rewards/accuracy_reward/mean": 2.045309543609619, + "rewards/accuracy_reward/std": 3.4072558879852295, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 679 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1223.0, + "completions/max_terminated_length": 1223.0, + "completions/mean_length": 671.65625, + "completions/mean_terminated_length": 671.65625, + "completions/min_length": 431.0, + "completions/min_terminated_length": 431.0, + "epoch": 0.4108761329305136, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0425165556371212, + "learning_rate": 2.1503778314202693e-06, + "loss": 0.0195, + "num_tokens": 114568188.0, + "reward": 3.7352406978607178, + "reward_std": 1.4962862730026245, + "rewards/accuracy_reward/mean": 2.9852404594421387, + "rewards/accuracy_reward/std": 3.7059760093688965, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 680 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 672.0, + "completions/max_terminated_length": 672.0, + "completions/mean_length": 509.46875, + "completions/mean_terminated_length": 509.46875, + "completions/min_length": 324.0, + "completions/min_terminated_length": 324.0, + "epoch": 0.4114803625377644, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04316689074039459, + "learning_rate": 2.1478710681820633e-06, + "loss": 0.0123, + "num_tokens": 114716698.0, + "reward": 3.7393624782562256, + "reward_std": 2.33754301071167, + "rewards/accuracy_reward/mean": 2.9893624782562256, + "rewards/accuracy_reward/std": 3.6428115367889404, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 681 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1040.0, + "completions/mean_length": 555.03125, + "completions/mean_terminated_length": 531.3333740234375, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.4120845921450151, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03265729546546936, + "learning_rate": 2.145362316508819e-06, + "loss": -0.0231, + "num_tokens": 114866684.0, + "reward": 5.651445388793945, + "reward_std": 1.4455407857894897, + "rewards/accuracy_reward/mean": 4.905351161956787, + "rewards/accuracy_reward/std": 3.6158998012542725, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 682 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1028.0, + "completions/max_terminated_length": 1028.0, + "completions/mean_length": 584.109375, + "completions/mean_terminated_length": 584.109375, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 0.41268882175226584, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03438195586204529, + "learning_rate": 2.142851586420179e-06, + "loss": -0.0027, + "num_tokens": 115037363.0, + "reward": 1.6545546054840088, + "reward_std": 1.8080638647079468, + "rewards/accuracy_reward/mean": 0.9045547246932983, + "rewards/accuracy_reward/std": 2.3293333053588867, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 683 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 984.0, + "completions/max_terminated_length": 984.0, + "completions/mean_length": 564.546875, + "completions/mean_terminated_length": 564.546875, + "completions/min_length": 284.0, + "completions/min_terminated_length": 284.0, + "epoch": 0.4132930513595166, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.02928777039051056, + "learning_rate": 2.140338887943686e-06, + "loss": -0.0012, + "num_tokens": 115177670.0, + "reward": 3.0747437477111816, + "reward_std": 0.9347789287567139, + "rewards/accuracy_reward/mean": 2.3247437477111816, + "rewards/accuracy_reward/std": 3.4800260066986084, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 684 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 958.0, + "completions/max_terminated_length": 958.0, + "completions/mean_length": 560.90625, + "completions/mean_terminated_length": 560.90625, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, + "epoch": 0.41389728096676737, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.025344466790556908, + "learning_rate": 2.137824231114745e-06, + "loss": 0.0084, + "num_tokens": 115379664.0, + "reward": 5.9462385177612305, + "reward_std": 0.860710859298706, + "rewards/accuracy_reward/mean": 5.196238994598389, + "rewards/accuracy_reward/std": 3.293389081954956, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 685 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1028.0, + "completions/max_terminated_length": 1028.0, + "completions/mean_length": 611.0625, + "completions/mean_terminated_length": 611.0625, + "completions/min_length": 323.0, + "completions/min_terminated_length": 323.0, + "epoch": 0.41450151057401813, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.046451590955257416, + "learning_rate": 2.1353076259765834e-06, + "loss": 0.0197, + "num_tokens": 115562660.0, + "reward": 2.335764169692993, + "reward_std": 2.014655351638794, + "rewards/accuracy_reward/mean": 1.585763931274414, + "rewards/accuracy_reward/std": 3.142025947570801, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 686 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1892.0, + "completions/max_terminated_length": 1892.0, + "completions/mean_length": 628.078125, + "completions/mean_terminated_length": 628.078125, + "completions/min_length": 327.0, + "completions/min_terminated_length": 327.0, + "epoch": 0.4151057401812689, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.03502468392252922, + "learning_rate": 2.1327890825802063e-06, + "loss": 0.0202, + "num_tokens": 115751593.0, + "reward": 1.1596343517303467, + "reward_std": 1.2506918907165527, + "rewards/accuracy_reward/mean": 0.4096343517303467, + "rewards/accuracy_reward/std": 1.85318922996521, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 687 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 747.0, + "completions/max_terminated_length": 747.0, + "completions/mean_length": 465.15625, + "completions/mean_terminated_length": 465.15625, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "epoch": 0.41570996978851965, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.020140022039413452, + "learning_rate": 2.1302686109843637e-06, + "loss": 0.0137, + "num_tokens": 115960275.0, + "reward": 4.375261306762695, + "reward_std": 1.06148362159729, + "rewards/accuracy_reward/mean": 3.6252613067626953, + "rewards/accuracy_reward/std": 3.7884480953216553, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 688 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 941.0, + "completions/max_terminated_length": 941.0, + "completions/mean_length": 594.140625, + "completions/mean_terminated_length": 594.140625, + "completions/min_length": 410.0, + "completions/min_terminated_length": 410.0, + "epoch": 0.4163141993957704, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0426773801445961, + "learning_rate": 2.127746221255505e-06, + "loss": -0.0013, + "num_tokens": 116120412.0, + "reward": 2.252953052520752, + "reward_std": 2.5191431045532227, + "rewards/accuracy_reward/mean": 1.502953052520752, + "rewards/accuracy_reward/std": 3.0263407230377197, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 689 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1112.0, + "completions/max_terminated_length": 1112.0, + "completions/mean_length": 564.421875, + "completions/mean_terminated_length": 564.421875, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "epoch": 0.4169184290030212, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.020818527787923813, + "learning_rate": 2.125221923467741e-06, + "loss": -0.0016, + "num_tokens": 116271991.0, + "reward": 3.988729476928711, + "reward_std": 0.9178120493888855, + "rewards/accuracy_reward/mean": 3.238729476928711, + "rewards/accuracy_reward/std": 3.7354774475097656, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 690 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 889.0, + "completions/max_terminated_length": 889.0, + "completions/mean_length": 593.515625, + "completions/mean_terminated_length": 593.515625, + "completions/min_length": 365.0, + "completions/min_terminated_length": 365.0, + "epoch": 0.4175226586102719, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04917926341295242, + "learning_rate": 2.122695727702802e-06, + "loss": -0.0152, + "num_tokens": 116429336.0, + "reward": 3.316498279571533, + "reward_std": 1.8826661109924316, + "rewards/accuracy_reward/mean": 2.5782172679901123, + "rewards/accuracy_reward/std": 3.5249016284942627, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 691 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1086.0, + "completions/max_terminated_length": 1086.0, + "completions/mean_length": 535.375, + "completions/mean_terminated_length": 535.375, + "completions/min_length": 310.0, + "completions/min_terminated_length": 310.0, + "epoch": 0.41812688821752264, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04290121793746948, + "learning_rate": 2.12016764405e-06, + "loss": 0.0207, + "num_tokens": 116600656.0, + "reward": 3.3990466594696045, + "reward_std": 1.0326236486434937, + "rewards/accuracy_reward/mean": 2.6490468978881836, + "rewards/accuracy_reward/std": 3.6060352325439453, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 692 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 795.0, + "completions/max_terminated_length": 795.0, + "completions/mean_length": 570.875, + "completions/mean_terminated_length": 570.875, + "completions/min_length": 369.0, + "completions/min_terminated_length": 369.0, + "epoch": 0.4187311178247734, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.035567451268434525, + "learning_rate": 2.1176376826061854e-06, + "loss": 0.0029, + "num_tokens": 116778888.0, + "reward": 5.465149879455566, + "reward_std": 1.510164737701416, + "rewards/accuracy_reward/mean": 4.715150356292725, + "rewards/accuracy_reward/std": 3.6980011463165283, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 693 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 845.0, + "completions/max_terminated_length": 845.0, + "completions/mean_length": 570.890625, + "completions/mean_terminated_length": 570.890625, + "completions/min_length": 258.0, + "completions/min_terminated_length": 258.0, + "epoch": 0.41933534743202416, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04001881927251816, + "learning_rate": 2.11510585347571e-06, + "loss": 0.0038, + "num_tokens": 116967185.0, + "reward": 5.372127532958984, + "reward_std": 1.6203138828277588, + "rewards/accuracy_reward/mean": 4.622127532958984, + "rewards/accuracy_reward/std": 3.6778836250305176, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 694 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 747.0, + "completions/max_terminated_length": 747.0, + "completions/mean_length": 428.453125, + "completions/mean_terminated_length": 428.453125, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "epoch": 0.4199395770392749, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.028359679505228996, + "learning_rate": 2.1125721667703836e-06, + "loss": 0.0033, + "num_tokens": 117125902.0, + "reward": 5.876052379608154, + "reward_std": 1.2264682054519653, + "rewards/accuracy_reward/mean": 5.126052379608154, + "rewards/accuracy_reward/std": 3.456382989883423, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 695 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1334.0, + "completions/max_terminated_length": 1334.0, + "completions/mean_length": 608.75, + "completions/mean_terminated_length": 608.75, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, + "epoch": 0.4205438066465257, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.026959730312228203, + "learning_rate": 2.110036632609435e-06, + "loss": 0.021, + "num_tokens": 117287886.0, + "reward": 1.2694875001907349, + "reward_std": 0.8056918978691101, + "rewards/accuracy_reward/mean": 0.5194875001907349, + "rewards/accuracy_reward/std": 1.8153141736984253, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 696 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1320.0, + "completions/max_terminated_length": 1320.0, + "completions/mean_length": 559.953125, + "completions/mean_terminated_length": 559.953125, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 0.42114803625377645, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03353206440806389, + "learning_rate": 2.107499261119472e-06, + "loss": 0.0245, + "num_tokens": 117444075.0, + "reward": 5.097177982330322, + "reward_std": 1.00447416305542, + "rewards/accuracy_reward/mean": 4.347177982330322, + "rewards/accuracy_reward/std": 3.6586711406707764, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 697 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 982.0, + "completions/max_terminated_length": 982.0, + "completions/mean_length": 598.515625, + "completions/mean_terminated_length": 598.515625, + "completions/min_length": 376.0, + "completions/min_terminated_length": 376.0, + "epoch": 0.4217522658610272, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.022409873083233833, + "learning_rate": 2.1049600624344406e-06, + "loss": -0.0033, + "num_tokens": 117602796.0, + "reward": 2.8371081352233887, + "reward_std": 0.6455166935920715, + "rewards/accuracy_reward/mean": 2.0910141468048096, + "rewards/accuracy_reward/std": 3.3691842555999756, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 698 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 762.0, + "completions/max_terminated_length": 762.0, + "completions/mean_length": 533.15625, + "completions/mean_terminated_length": 533.15625, + "completions/min_length": 368.0, + "completions/min_terminated_length": 368.0, + "epoch": 0.42235649546827797, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0414394736289978, + "learning_rate": 2.1024190466955846e-06, + "loss": -0.0119, + "num_tokens": 117824230.0, + "reward": 3.022568702697754, + "reward_std": 1.8104865550994873, + "rewards/accuracy_reward/mean": 2.272568702697754, + "rewards/accuracy_reward/std": 3.391857624053955, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 699 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 760.0, + "completions/max_terminated_length": 760.0, + "completions/mean_length": 483.234375, + "completions/mean_terminated_length": 483.234375, + "completions/min_length": 329.0, + "completions/min_terminated_length": 329.0, + "epoch": 0.4229607250755287, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03836415708065033, + "learning_rate": 2.099876224051403e-06, + "loss": 0.0029, + "num_tokens": 118026725.0, + "reward": 2.923229217529297, + "reward_std": 2.0224008560180664, + "rewards/accuracy_reward/mean": 2.173229217529297, + "rewards/accuracy_reward/std": 3.407825469970703, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 700 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1047.0, + "completions/max_terminated_length": 1047.0, + "completions/mean_length": 617.46875, + "completions/mean_terminated_length": 617.46875, + "completions/min_length": 394.0, + "completions/min_terminated_length": 394.0, + "epoch": 0.42356495468277944, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.043495457619428635, + "learning_rate": 2.097331604657614e-06, + "loss": -0.0122, + "num_tokens": 118179603.0, + "reward": 4.699648380279541, + "reward_std": 1.8985133171081543, + "rewards/accuracy_reward/mean": 3.94964861869812, + "rewards/accuracy_reward/std": 3.763468027114868, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 701 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 993.0, + "completions/max_terminated_length": 993.0, + "completions/mean_length": 579.875, + "completions/mean_terminated_length": 579.875, + "completions/min_length": 379.0, + "completions/min_terminated_length": 379.0, + "epoch": 0.4241691842900302, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.02474219910800457, + "learning_rate": 2.0947851986771102e-06, + "loss": 0.0127, + "num_tokens": 118346875.0, + "reward": 4.24544620513916, + "reward_std": 0.7337483763694763, + "rewards/accuracy_reward/mean": 3.4954464435577393, + "rewards/accuracy_reward/std": 3.6733076572418213, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 702 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 796.0, + "completions/max_terminated_length": 796.0, + "completions/mean_length": 448.046875, + "completions/mean_terminated_length": 448.046875, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 0.42477341389728096, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.030378414317965508, + "learning_rate": 2.0922370162799195e-06, + "loss": -0.0037, + "num_tokens": 118470062.0, + "reward": 3.5910744667053223, + "reward_std": 0.9851620197296143, + "rewards/accuracy_reward/mean": 2.8449807167053223, + "rewards/accuracy_reward/std": 3.9064698219299316, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 703 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1665.0, + "completions/max_terminated_length": 1665.0, + "completions/mean_length": 581.0625, + "completions/mean_terminated_length": 581.0625, + "completions/min_length": 372.0, + "completions/min_terminated_length": 372.0, + "epoch": 0.4253776435045317, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.03876934573054314, + "learning_rate": 2.089687067643165e-06, + "loss": -0.0156, + "num_tokens": 118595778.0, + "reward": 3.544182777404785, + "reward_std": 1.5672262907028198, + "rewards/accuracy_reward/mean": 2.794182777404785, + "rewards/accuracy_reward/std": 3.635831356048584, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 704 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 820.0, + "completions/max_terminated_length": 820.0, + "completions/mean_length": 491.828125, + "completions/mean_terminated_length": 491.828125, + "completions/min_length": 294.0, + "completions/min_terminated_length": 294.0, + "epoch": 0.4259818731117825, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.016097916290163994, + "learning_rate": 2.0871353629510237e-06, + "loss": -0.0003, + "num_tokens": 118733831.0, + "reward": 4.5666704177856445, + "reward_std": 0.4908318817615509, + "rewards/accuracy_reward/mean": 3.8166704177856445, + "rewards/accuracy_reward/std": 3.7621123790740967, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 705 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1050.0, + "completions/max_terminated_length": 1050.0, + "completions/mean_length": 609.8125, + "completions/mean_terminated_length": 609.8125, + "completions/min_length": 331.0, + "completions/min_terminated_length": 331.0, + "epoch": 0.42658610271903324, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0466015487909317, + "learning_rate": 2.084581912394688e-06, + "loss": 0.0076, + "num_tokens": 118982363.0, + "reward": 3.6956608295440674, + "reward_std": 1.6739875078201294, + "rewards/accuracy_reward/mean": 2.9456610679626465, + "rewards/accuracy_reward/std": 3.7591166496276855, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 706 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 857.0, + "completions/max_terminated_length": 857.0, + "completions/mean_length": 492.171875, + "completions/mean_terminated_length": 492.171875, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "epoch": 0.427190332326284, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0191543810069561, + "learning_rate": 2.08202672617232e-06, + "loss": 0.0008, + "num_tokens": 119132694.0, + "reward": 5.940855503082275, + "reward_std": 0.7327542901039124, + "rewards/accuracy_reward/mean": 5.190855503082275, + "rewards/accuracy_reward/std": 3.3781652450561523, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 707 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 910.0, + "completions/max_terminated_length": 910.0, + "completions/mean_length": 589.46875, + "completions/mean_terminated_length": 589.46875, + "completions/min_length": 346.0, + "completions/min_terminated_length": 346.0, + "epoch": 0.42779456193353477, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.050458136945962906, + "learning_rate": 2.0794698144890156e-06, + "loss": 0.0028, + "num_tokens": 119299076.0, + "reward": 4.2106122970581055, + "reward_std": 1.9883956909179688, + "rewards/accuracy_reward/mean": 3.4606122970581055, + "rewards/accuracy_reward/std": 3.7872214317321777, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 708 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 778.0, + "completions/max_terminated_length": 778.0, + "completions/mean_length": 482.125, + "completions/mean_terminated_length": 482.125, + "completions/min_length": 301.0, + "completions/min_terminated_length": 301.0, + "epoch": 0.4283987915407855, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03880058228969574, + "learning_rate": 2.0769111875567615e-06, + "loss": 0.0202, + "num_tokens": 119422044.0, + "reward": 5.966324806213379, + "reward_std": 1.3389965295791626, + "rewards/accuracy_reward/mean": 5.216324806213379, + "rewards/accuracy_reward/std": 3.4530558586120605, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 709 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 789.0, + "completions/max_terminated_length": 789.0, + "completions/mean_length": 562.671875, + "completions/mean_terminated_length": 562.671875, + "completions/min_length": 394.0, + "completions/min_terminated_length": 394.0, + "epoch": 0.42900302114803623, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03315300866961479, + "learning_rate": 2.074350855594395e-06, + "loss": -0.0165, + "num_tokens": 119567863.0, + "reward": 3.6319422721862793, + "reward_std": 1.5855517387390137, + "rewards/accuracy_reward/mean": 2.8819422721862793, + "rewards/accuracy_reward/std": 3.6624889373779297, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 710 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1170.0, + "completions/max_terminated_length": 1170.0, + "completions/mean_length": 647.25, + "completions/mean_terminated_length": 647.25, + "completions/min_length": 385.0, + "completions/min_terminated_length": 385.0, + "epoch": 0.429607250755287, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.028624195605516434, + "learning_rate": 2.071788828827562e-06, + "loss": 0.0258, + "num_tokens": 119710903.0, + "reward": 2.480682849884033, + "reward_std": 1.1548395156860352, + "rewards/accuracy_reward/mean": 1.7306828498840332, + "rewards/accuracy_reward/std": 3.0647194385528564, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 711 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 842.0, + "completions/max_terminated_length": 842.0, + "completions/mean_length": 519.546875, + "completions/mean_terminated_length": 519.546875, + "completions/min_length": 283.0, + "completions/min_terminated_length": 283.0, + "epoch": 0.43021148036253776, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03774598613381386, + "learning_rate": 2.0692251174886804e-06, + "loss": -0.0202, + "num_tokens": 119858250.0, + "reward": 4.937065601348877, + "reward_std": 1.7700341939926147, + "rewards/accuracy_reward/mean": 4.187065601348877, + "rewards/accuracy_reward/std": 3.7099030017852783, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 712 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 787.0, + "completions/max_terminated_length": 787.0, + "completions/mean_length": 460.875, + "completions/mean_terminated_length": 460.875, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 0.4308157099697885, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.027107805013656616, + "learning_rate": 2.066659731816893e-06, + "loss": -0.0104, + "num_tokens": 120014402.0, + "reward": 4.791017532348633, + "reward_std": 1.1237720251083374, + "rewards/accuracy_reward/mean": 4.041017055511475, + "rewards/accuracy_reward/std": 3.7076308727264404, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 713 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 828.0, + "completions/mean_length": 555.125, + "completions/mean_terminated_length": 531.4285888671875, + "completions/min_length": 326.0, + "completions/min_terminated_length": 326.0, + "epoch": 0.4314199395770393, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05424496531486511, + "learning_rate": 2.064092682058031e-06, + "loss": -0.0239, + "num_tokens": 120192762.0, + "reward": 4.198179721832275, + "reward_std": 3.044196128845215, + "rewards/accuracy_reward/mean": 3.4598984718322754, + "rewards/accuracy_reward/std": 3.6617960929870605, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 714 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 921.0, + "completions/max_terminated_length": 921.0, + "completions/mean_length": 537.171875, + "completions/mean_terminated_length": 537.171875, + "completions/min_length": 293.0, + "completions/min_terminated_length": 293.0, + "epoch": 0.43202416918429004, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0531880222260952, + "learning_rate": 2.06152397846457e-06, + "loss": -0.0059, + "num_tokens": 120343013.0, + "reward": 6.0978312492370605, + "reward_std": 2.6905064582824707, + "rewards/accuracy_reward/mean": 5.3478312492370605, + "rewards/accuracy_reward/std": 3.342874765396118, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 715 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 849.0, + "completions/max_terminated_length": 849.0, + "completions/mean_length": 539.0, + "completions/mean_terminated_length": 539.0, + "completions/min_length": 319.0, + "completions/min_terminated_length": 319.0, + "epoch": 0.4326283987915408, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04615502804517746, + "learning_rate": 2.058953631295594e-06, + "loss": -0.008, + "num_tokens": 120487061.0, + "reward": 4.414626121520996, + "reward_std": 2.5422630310058594, + "rewards/accuracy_reward/mean": 3.6646265983581543, + "rewards/accuracy_reward/std": 3.715939521789551, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 716 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 834.0, + "completions/max_terminated_length": 834.0, + "completions/mean_length": 529.25, + "completions/mean_terminated_length": 529.25, + "completions/min_length": 337.0, + "completions/min_terminated_length": 337.0, + "epoch": 0.43323262839879156, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03470669314265251, + "learning_rate": 2.056381650816749e-06, + "loss": 0.0012, + "num_tokens": 120770181.0, + "reward": 3.6260874271392822, + "reward_std": 0.9848096966743469, + "rewards/accuracy_reward/mean": 2.8760874271392822, + "rewards/accuracy_reward/std": 3.548102855682373, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 717 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 869.0, + "completions/max_terminated_length": 869.0, + "completions/mean_length": 644.296875, + "completions/mean_terminated_length": 644.296875, + "completions/min_length": 474.0, + "completions/min_terminated_length": 474.0, + "epoch": 0.43383685800604227, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04262460768222809, + "learning_rate": 2.0538080473002035e-06, + "loss": -0.0087, + "num_tokens": 120917672.0, + "reward": 3.394796848297119, + "reward_std": 1.6214139461517334, + "rewards/accuracy_reward/mean": 2.644796848297119, + "rewards/accuracy_reward/std": 3.6004273891448975, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 718 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1052.0, + "completions/max_terminated_length": 1052.0, + "completions/mean_length": 519.921875, + "completions/mean_terminated_length": 519.921875, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.43444108761329303, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0668153241276741, + "learning_rate": 2.051232831024611e-06, + "loss": 0.057, + "num_tokens": 121149795.0, + "reward": 5.129771709442139, + "reward_std": 3.3851962089538574, + "rewards/accuracy_reward/mean": 4.379771709442139, + "rewards/accuracy_reward/std": 3.6857519149780273, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 719 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 812.0, + "completions/max_terminated_length": 812.0, + "completions/mean_length": 475.265625, + "completions/mean_terminated_length": 475.265625, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "epoch": 0.4350453172205438, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0362289622426033, + "learning_rate": 2.048656012275064e-06, + "loss": 0.0237, + "num_tokens": 121290772.0, + "reward": 6.6540327072143555, + "reward_std": 1.5500786304473877, + "rewards/accuracy_reward/mean": 5.9040327072143555, + "rewards/accuracy_reward/std": 3.01130747795105, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 720 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 803.0, + "completions/max_terminated_length": 803.0, + "completions/mean_length": 578.734375, + "completions/mean_terminated_length": 578.734375, + "completions/min_length": 384.0, + "completions/min_terminated_length": 384.0, + "epoch": 0.43564954682779455, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.026829631999135017, + "learning_rate": 2.0460776013430557e-06, + "loss": -0.0049, + "num_tokens": 121454019.0, + "reward": 4.113104820251465, + "reward_std": 1.147188663482666, + "rewards/accuracy_reward/mean": 3.3631045818328857, + "rewards/accuracy_reward/std": 3.757610559463501, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 721 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 805.0, + "completions/max_terminated_length": 805.0, + "completions/mean_length": 477.265625, + "completions/mean_terminated_length": 477.265625, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.4362537764350453, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.03278592228889465, + "learning_rate": 2.0434976085264375e-06, + "loss": -0.0362, + "num_tokens": 121596052.0, + "reward": 3.532292127609253, + "reward_std": 1.5980455875396729, + "rewards/accuracy_reward/mean": 2.782292366027832, + "rewards/accuracy_reward/std": 3.594203472137451, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 722 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 818.0, + "completions/max_terminated_length": 818.0, + "completions/mean_length": 544.296875, + "completions/mean_terminated_length": 544.296875, + "completions/min_length": 319.0, + "completions/min_terminated_length": 319.0, + "epoch": 0.4368580060422961, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03848155587911606, + "learning_rate": 2.04091604412938e-06, + "loss": 0.0105, + "num_tokens": 121789879.0, + "reward": 5.514378547668457, + "reward_std": 2.2287933826446533, + "rewards/accuracy_reward/mean": 4.764378547668457, + "rewards/accuracy_reward/std": 3.5967295169830322, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 723 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 796.0, + "completions/max_terminated_length": 796.0, + "completions/mean_length": 458.140625, + "completions/mean_terminated_length": 458.140625, + "completions/min_length": 310.0, + "completions/min_terminated_length": 310.0, + "epoch": 0.43746223564954684, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.02533535659313202, + "learning_rate": 2.0383329184623303e-06, + "loss": -0.0074, + "num_tokens": 121999376.0, + "reward": 4.615803241729736, + "reward_std": 1.1456729173660278, + "rewards/accuracy_reward/mean": 3.8658032417297363, + "rewards/accuracy_reward/std": 3.727808952331543, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 724 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 759.0, + "completions/max_terminated_length": 759.0, + "completions/mean_length": 554.09375, + "completions/mean_terminated_length": 554.09375, + "completions/min_length": 339.0, + "completions/min_terminated_length": 339.0, + "epoch": 0.4380664652567976, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0003046374476980418, + "learning_rate": 2.03574824184197e-06, + "loss": -0.0002, + "num_tokens": 122142374.0, + "reward": 0.9456546902656555, + "reward_std": 0.012776797637343407, + "rewards/accuracy_reward/mean": 0.19565469026565552, + "rewards/accuracy_reward/std": 0.20246651768684387, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 725 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 725.0, + "completions/max_terminated_length": 725.0, + "completions/mean_length": 465.265625, + "completions/mean_terminated_length": 465.265625, + "completions/min_length": 304.0, + "completions/min_terminated_length": 304.0, + "epoch": 0.43867069486404836, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.030228499323129654, + "learning_rate": 2.0331620245911762e-06, + "loss": -0.0045, + "num_tokens": 122344327.0, + "reward": 5.819157600402832, + "reward_std": 1.4096050262451172, + "rewards/accuracy_reward/mean": 5.069157600402832, + "rewards/accuracy_reward/std": 3.354362726211548, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 726 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 822.0, + "completions/mean_length": 581.21875, + "completions/mean_terminated_length": 557.9365234375, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "epoch": 0.43927492447129907, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05012798681855202, + "learning_rate": 2.0305742770389773e-06, + "loss": -0.042, + "num_tokens": 122465125.0, + "reward": 6.363447189331055, + "reward_std": 1.9647517204284668, + "rewards/accuracy_reward/mean": 5.6251654624938965, + "rewards/accuracy_reward/std": 3.216240167617798, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 727 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1363.0, + "completions/max_terminated_length": 1363.0, + "completions/mean_length": 560.0, + "completions/mean_terminated_length": 560.0, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "epoch": 0.4398791540785498, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.018356624990701675, + "learning_rate": 2.027985009520516e-06, + "loss": 0.0004, + "num_tokens": 122612773.0, + "reward": 4.198784351348877, + "reward_std": 0.7276179790496826, + "rewards/accuracy_reward/mean": 3.448784351348877, + "rewards/accuracy_reward/std": 3.767714262008667, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 728 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 973.0, + "completions/max_terminated_length": 973.0, + "completions/mean_length": 601.484375, + "completions/mean_terminated_length": 601.484375, + "completions/min_length": 364.0, + "completions/min_terminated_length": 364.0, + "epoch": 0.4404833836858006, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03497301787137985, + "learning_rate": 2.0253942323770033e-06, + "loss": 0.0073, + "num_tokens": 122783572.0, + "reward": 4.235072135925293, + "reward_std": 1.9226880073547363, + "rewards/accuracy_reward/mean": 3.485071897506714, + "rewards/accuracy_reward/std": 3.773130178451538, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 729 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 804.0, + "completions/max_terminated_length": 804.0, + "completions/mean_length": 479.171875, + "completions/mean_terminated_length": 479.171875, + "completions/min_length": 289.0, + "completions/min_terminated_length": 289.0, + "epoch": 0.44108761329305135, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.046406690031290054, + "learning_rate": 2.0228019559556814e-06, + "loss": 0.0244, + "num_tokens": 122939599.0, + "reward": 4.904665946960449, + "reward_std": 2.538475751876831, + "rewards/accuracy_reward/mean": 4.154665946960449, + "rewards/accuracy_reward/std": 3.7506096363067627, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 730 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 859.0, + "completions/max_terminated_length": 859.0, + "completions/mean_length": 590.453125, + "completions/mean_terminated_length": 590.453125, + "completions/min_length": 395.0, + "completions/min_terminated_length": 395.0, + "epoch": 0.4416918429003021, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.054214321076869965, + "learning_rate": 2.0202081906097786e-06, + "loss": 0.0026, + "num_tokens": 123084236.0, + "reward": 6.9324188232421875, + "reward_std": 2.3655712604522705, + "rewards/accuracy_reward/mean": 6.1824188232421875, + "rewards/accuracy_reward/std": 2.838960647583008, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 731 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 802.0, + "completions/max_terminated_length": 802.0, + "completions/mean_length": 485.140625, + "completions/mean_terminated_length": 485.140625, + "completions/min_length": 332.0, + "completions/min_terminated_length": 332.0, + "epoch": 0.4422960725075529, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.042421918362379074, + "learning_rate": 2.017612946698471e-06, + "loss": -0.0119, + "num_tokens": 123223781.0, + "reward": 2.380335807800293, + "reward_std": 1.762880802154541, + "rewards/accuracy_reward/mean": 1.630335807800293, + "rewards/accuracy_reward/std": 3.1054422855377197, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 732 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1016.0, + "completions/max_terminated_length": 1016.0, + "completions/mean_length": 472.828125, + "completions/mean_terminated_length": 472.828125, + "completions/min_length": 298.0, + "completions/min_terminated_length": 298.0, + "epoch": 0.44290030211480363, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0006566231022588909, + "learning_rate": 2.0150162345868397e-06, + "loss": -0.0005, + "num_tokens": 123393882.0, + "reward": 6.3202056884765625, + "reward_std": 0.03816165775060654, + "rewards/accuracy_reward/mean": 5.570204734802246, + "rewards/accuracy_reward/std": 3.241830348968506, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 733 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1362.0, + "completions/mean_length": 755.140625, + "completions/mean_terminated_length": 734.6190795898438, + "completions/min_length": 356.0, + "completions/min_terminated_length": 356.0, + "epoch": 0.4435045317220544, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04529723525047302, + "learning_rate": 2.0124180646458295e-06, + "loss": 0.0119, + "num_tokens": 123579923.0, + "reward": 2.051043748855591, + "reward_std": 1.932809591293335, + "rewards/accuracy_reward/mean": 1.3127624988555908, + "rewards/accuracy_reward/std": 2.8322837352752686, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 734 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 987.0, + "completions/max_terminated_length": 987.0, + "completions/mean_length": 626.515625, + "completions/mean_terminated_length": 626.515625, + "completions/min_length": 385.0, + "completions/min_terminated_length": 385.0, + "epoch": 0.44410876132930516, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03743002936244011, + "learning_rate": 2.009818447252207e-06, + "loss": 0.0107, + "num_tokens": 123760452.0, + "reward": 5.63841438293457, + "reward_std": 1.4526405334472656, + "rewards/accuracy_reward/mean": 4.88841438293457, + "rewards/accuracy_reward/std": 3.551651954650879, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 735 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 880.0, + "completions/max_terminated_length": 880.0, + "completions/mean_length": 570.921875, + "completions/mean_terminated_length": 570.921875, + "completions/min_length": 369.0, + "completions/min_terminated_length": 369.0, + "epoch": 0.4447129909365559, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.045301347970962524, + "learning_rate": 2.0072173927885208e-06, + "loss": 0.0175, + "num_tokens": 123900031.0, + "reward": 5.394380569458008, + "reward_std": 2.0846314430236816, + "rewards/accuracy_reward/mean": 4.644380569458008, + "rewards/accuracy_reward/std": 3.647627592086792, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 736 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 745.0, + "completions/max_terminated_length": 745.0, + "completions/mean_length": 503.828125, + "completions/mean_terminated_length": 503.828125, + "completions/min_length": 376.0, + "completions/min_terminated_length": 376.0, + "epoch": 0.4453172205438066, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.034450046718120575, + "learning_rate": 2.004614911643058e-06, + "loss": 0.0004, + "num_tokens": 124053508.0, + "reward": 3.9766921997070312, + "reward_std": 1.4494253396987915, + "rewards/accuracy_reward/mean": 3.2305984497070312, + "rewards/accuracy_reward/std": 3.604681968688965, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 737 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 910.0, + "completions/max_terminated_length": 910.0, + "completions/mean_length": 568.046875, + "completions/mean_terminated_length": 568.046875, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, + "epoch": 0.4459214501510574, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.02665473148226738, + "learning_rate": 2.002011014209805e-06, + "loss": -0.003, + "num_tokens": 124201255.0, + "reward": 5.971526145935059, + "reward_std": 0.7958590984344482, + "rewards/accuracy_reward/mean": 5.225432872772217, + "rewards/accuracy_reward/std": 3.445213794708252, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 738 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1421.0, + "completions/max_terminated_length": 1421.0, + "completions/mean_length": 640.53125, + "completions/mean_terminated_length": 640.53125, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "epoch": 0.44652567975830815, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04437312111258507, + "learning_rate": 1.9994057108884033e-06, + "loss": 0.0133, + "num_tokens": 124379993.0, + "reward": 5.14116096496582, + "reward_std": 2.306351661682129, + "rewards/accuracy_reward/mean": 4.3911614418029785, + "rewards/accuracy_reward/std": 3.593148708343506, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 739 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 765.0, + "completions/max_terminated_length": 765.0, + "completions/mean_length": 470.46875, + "completions/mean_terminated_length": 470.46875, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "epoch": 0.4471299093655589, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.048284862190485, + "learning_rate": 1.996799012084109e-06, + "loss": 0.0062, + "num_tokens": 124535399.0, + "reward": 6.731029510498047, + "reward_std": 2.5870158672332764, + "rewards/accuracy_reward/mean": 5.981029510498047, + "rewards/accuracy_reward/std": 3.0644712448120117, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 740 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 862.0, + "completions/max_terminated_length": 862.0, + "completions/mean_length": 520.984375, + "completions/mean_terminated_length": 520.984375, + "completions/min_length": 280.0, + "completions/min_terminated_length": 280.0, + "epoch": 0.44773413897280967, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04327604919672012, + "learning_rate": 1.9941909282077543e-06, + "loss": 0.0158, + "num_tokens": 124681782.0, + "reward": 4.106295108795166, + "reward_std": 1.5598186254501343, + "rewards/accuracy_reward/mean": 3.360201597213745, + "rewards/accuracy_reward/std": 3.7543938159942627, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 741 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 787.0, + "completions/max_terminated_length": 787.0, + "completions/mean_length": 515.4375, + "completions/mean_terminated_length": 515.4375, + "completions/min_length": 287.0, + "completions/min_terminated_length": 287.0, + "epoch": 0.44833836858006043, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04828551784157753, + "learning_rate": 1.9915814696757003e-06, + "loss": -0.0125, + "num_tokens": 124858818.0, + "reward": 2.3139312267303467, + "reward_std": 2.725775718688965, + "rewards/accuracy_reward/mean": 1.5639312267303467, + "rewards/accuracy_reward/std": 3.1549417972564697, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 742 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 955.0, + "completions/max_terminated_length": 955.0, + "completions/mean_length": 619.734375, + "completions/mean_terminated_length": 619.734375, + "completions/min_length": 354.0, + "completions/min_terminated_length": 354.0, + "epoch": 0.4489425981873112, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.02513016387820244, + "learning_rate": 1.9889706469098e-06, + "loss": -0.0013, + "num_tokens": 125065281.0, + "reward": 0.942882776260376, + "reward_std": 1.0598094463348389, + "rewards/accuracy_reward/mean": 0.19288280606269836, + "rewards/accuracy_reward/std": 1.3327537775039673, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 743 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1012.0, + "completions/max_terminated_length": 1012.0, + "completions/mean_length": 634.484375, + "completions/mean_terminated_length": 634.484375, + "completions/min_length": 334.0, + "completions/min_terminated_length": 334.0, + "epoch": 0.44954682779456195, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0525844469666481, + "learning_rate": 1.9863584703373534e-06, + "loss": -0.044, + "num_tokens": 125321888.0, + "reward": 6.12529993057251, + "reward_std": 2.214630126953125, + "rewards/accuracy_reward/mean": 5.37529993057251, + "rewards/accuracy_reward/std": 3.4179675579071045, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 744 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1114.0, + "completions/max_terminated_length": 1114.0, + "completions/mean_length": 710.046875, + "completions/mean_terminated_length": 710.046875, + "completions/min_length": 295.0, + "completions/min_terminated_length": 295.0, + "epoch": 0.4501510574018127, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.02806648053228855, + "learning_rate": 1.9837449503910687e-06, + "loss": 0.0157, + "num_tokens": 125494227.0, + "reward": 3.0439205169677734, + "reward_std": 1.1318395137786865, + "rewards/accuracy_reward/mean": 2.2939205169677734, + "rewards/accuracy_reward/std": 3.3417117595672607, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 745 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1017.0, + "completions/max_terminated_length": 1017.0, + "completions/mean_length": 531.78125, + "completions/mean_terminated_length": 531.78125, + "completions/min_length": 331.0, + "completions/min_terminated_length": 331.0, + "epoch": 0.4507552870090634, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.01648090034723282, + "learning_rate": 1.9811300975090196e-06, + "loss": -0.0065, + "num_tokens": 125641349.0, + "reward": 6.077434062957764, + "reward_std": 0.647053062915802, + "rewards/accuracy_reward/mean": 5.327434062957764, + "rewards/accuracy_reward/std": 3.3589887619018555, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 746 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1543.0, + "completions/max_terminated_length": 1543.0, + "completions/mean_length": 616.015625, + "completions/mean_terminated_length": 616.015625, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 0.4513595166163142, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04022655636072159, + "learning_rate": 1.978513922134602e-06, + "loss": -0.0383, + "num_tokens": 125796326.0, + "reward": 5.05560302734375, + "reward_std": 2.2670931816101074, + "rewards/accuracy_reward/mean": 4.30560302734375, + "rewards/accuracy_reward/std": 3.7116763591766357, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 747 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 895.0, + "completions/max_terminated_length": 895.0, + "completions/mean_length": 532.40625, + "completions/mean_terminated_length": 532.40625, + "completions/min_length": 297.0, + "completions/min_terminated_length": 297.0, + "epoch": 0.45196374622356494, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.022309819236397743, + "learning_rate": 1.9758964347164954e-06, + "loss": 0.0096, + "num_tokens": 125917696.0, + "reward": 6.107173919677734, + "reward_std": 0.9187861084938049, + "rewards/accuracy_reward/mean": 5.357173919677734, + "rewards/accuracy_reward/std": 3.360952138900757, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 748 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 645.0, + "completions/max_terminated_length": 645.0, + "completions/mean_length": 475.578125, + "completions/mean_terminated_length": 475.578125, + "completions/min_length": 295.0, + "completions/min_terminated_length": 295.0, + "epoch": 0.4525679758308157, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03215288370847702, + "learning_rate": 1.973277645708618e-06, + "loss": -0.0028, + "num_tokens": 126073125.0, + "reward": 3.732945203781128, + "reward_std": 1.4811968803405762, + "rewards/accuracy_reward/mean": 2.982945442199707, + "rewards/accuracy_reward/std": 3.703084707260132, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 749 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 993.0, + "completions/max_terminated_length": 993.0, + "completions/mean_length": 555.796875, + "completions/mean_terminated_length": 555.796875, + "completions/min_length": 396.0, + "completions/min_terminated_length": 396.0, + "epoch": 0.45317220543806647, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.041108082979917526, + "learning_rate": 1.970657565570087e-06, + "loss": -0.0033, + "num_tokens": 126257112.0, + "reward": 5.191970348358154, + "reward_std": 2.055255889892578, + "rewards/accuracy_reward/mean": 4.445876121520996, + "rewards/accuracy_reward/std": 3.724428415298462, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 750 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 894.0, + "completions/max_terminated_length": 894.0, + "completions/mean_length": 565.1875, + "completions/mean_terminated_length": 565.1875, + "completions/min_length": 272.0, + "completions/min_terminated_length": 272.0, + "epoch": 0.45377643504531723, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03227810561656952, + "learning_rate": 1.968036204765176e-06, + "loss": 0.0046, + "num_tokens": 126441524.0, + "reward": 4.858595371246338, + "reward_std": 1.449907898902893, + "rewards/accuracy_reward/mean": 4.108595371246338, + "rewards/accuracy_reward/std": 3.5727880001068115, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 751 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 806.0, + "completions/mean_length": 558.40625, + "completions/mean_terminated_length": 534.761962890625, + "completions/min_length": 357.0, + "completions/min_terminated_length": 357.0, + "epoch": 0.454380664652568, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04583609104156494, + "learning_rate": 1.965413573763274e-06, + "loss": -0.0156, + "num_tokens": 126615966.0, + "reward": 4.866100788116455, + "reward_std": 1.9026706218719482, + "rewards/accuracy_reward/mean": 4.127819538116455, + "rewards/accuracy_reward/std": 3.8035290241241455, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 752 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 645.0, + "completions/max_terminated_length": 645.0, + "completions/mean_length": 436.765625, + "completions/mean_terminated_length": 436.765625, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "epoch": 0.45498489425981875, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.029607100412249565, + "learning_rate": 1.962789683038843e-06, + "loss": 0.0065, + "num_tokens": 126724735.0, + "reward": 3.9761130809783936, + "reward_std": 1.2007685899734497, + "rewards/accuracy_reward/mean": 3.2261130809783936, + "rewards/accuracy_reward/std": 3.687981367111206, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 753 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 721.0, + "completions/max_terminated_length": 721.0, + "completions/mean_length": 521.6875, + "completions/mean_terminated_length": 521.6875, + "completions/min_length": 376.0, + "completions/min_terminated_length": 376.0, + "epoch": 0.4555891238670695, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0021817598026245832, + "learning_rate": 1.9601645430713737e-06, + "loss": -0.0006, + "num_tokens": 126926043.0, + "reward": 2.6005187034606934, + "reward_std": 0.10489372909069061, + "rewards/accuracy_reward/mean": 1.8544249534606934, + "rewards/accuracy_reward/std": 3.262376070022583, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 754 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 948.0, + "completions/max_terminated_length": 948.0, + "completions/mean_length": 585.734375, + "completions/mean_terminated_length": 585.734375, + "completions/min_length": 383.0, + "completions/min_terminated_length": 383.0, + "epoch": 0.4561933534743202, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04636941850185394, + "learning_rate": 1.9575381643453504e-06, + "loss": 0.0073, + "num_tokens": 127119930.0, + "reward": 3.2107205390930176, + "reward_std": 2.3072314262390137, + "rewards/accuracy_reward/mean": 2.4607203006744385, + "rewards/accuracy_reward/std": 3.5169293880462646, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 755 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 815.0, + "completions/max_terminated_length": 815.0, + "completions/mean_length": 536.75, + "completions/mean_terminated_length": 536.75, + "completions/min_length": 321.0, + "completions/min_terminated_length": 321.0, + "epoch": 0.456797583081571, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.033422552049160004, + "learning_rate": 1.954910557350202e-06, + "loss": -0.0054, + "num_tokens": 127263178.0, + "reward": 2.8348031044006348, + "reward_std": 1.1038528680801392, + "rewards/accuracy_reward/mean": 2.0848031044006348, + "rewards/accuracy_reward/std": 3.249577522277832, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 756 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 944.0, + "completions/max_terminated_length": 944.0, + "completions/mean_length": 537.4375, + "completions/mean_terminated_length": 537.4375, + "completions/min_length": 280.0, + "completions/min_terminated_length": 280.0, + "epoch": 0.45740181268882174, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.015458785928785801, + "learning_rate": 1.952281732580263e-06, + "loss": -0.005, + "num_tokens": 127394518.0, + "reward": 0.8743484020233154, + "reward_std": 0.49739375710487366, + "rewards/accuracy_reward/mean": 0.12434843927621841, + "rewards/accuracy_reward/std": 0.9383406639099121, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 757 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 990.0, + "completions/max_terminated_length": 990.0, + "completions/mean_length": 614.78125, + "completions/mean_terminated_length": 614.78125, + "completions/min_length": 320.0, + "completions/min_terminated_length": 320.0, + "epoch": 0.4580060422960725, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.035018883645534515, + "learning_rate": 1.949651700534733e-06, + "loss": -0.0163, + "num_tokens": 127578952.0, + "reward": 2.6631920337677, + "reward_std": 2.0662026405334473, + "rewards/accuracy_reward/mean": 1.9131921529769897, + "rewards/accuracy_reward/std": 3.357705593109131, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 758 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1176.0, + "completions/mean_length": 652.0625, + "completions/mean_terminated_length": 629.90478515625, + "completions/min_length": 383.0, + "completions/min_terminated_length": 383.0, + "epoch": 0.45861027190332326, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.021633055061101913, + "learning_rate": 1.9470204717176313e-06, + "loss": -0.0132, + "num_tokens": 127732636.0, + "reward": 6.155335903167725, + "reward_std": 0.5916734337806702, + "rewards/accuracy_reward/mean": 5.417055130004883, + "rewards/accuracy_reward/std": 3.3598575592041016, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 759 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 992.0, + "completions/max_terminated_length": 992.0, + "completions/mean_length": 532.671875, + "completions/mean_terminated_length": 532.671875, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 0.459214501510574, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04914315789937973, + "learning_rate": 1.944388056637759e-06, + "loss": -0.0077, + "num_tokens": 127871335.0, + "reward": 2.608334541320801, + "reward_std": 2.395906686782837, + "rewards/accuracy_reward/mean": 1.8583344221115112, + "rewards/accuracy_reward/std": 3.2442569732666016, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 760 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1269.0, + "completions/max_terminated_length": 1269.0, + "completions/mean_length": 676.5625, + "completions/mean_terminated_length": 676.5625, + "completions/min_length": 317.0, + "completions/min_terminated_length": 317.0, + "epoch": 0.4598187311178248, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.01489989086985588, + "learning_rate": 1.941754465808654e-06, + "loss": 0.0058, + "num_tokens": 128098491.0, + "reward": 2.831993579864502, + "reward_std": 0.44284749031066895, + "rewards/accuracy_reward/mean": 2.081993579864502, + "rewards/accuracy_reward/std": 3.253831148147583, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 761 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1755.0, + "completions/max_terminated_length": 1755.0, + "completions/mean_length": 577.28125, + "completions/mean_terminated_length": 577.28125, + "completions/min_length": 313.0, + "completions/min_terminated_length": 313.0, + "epoch": 0.46042296072507555, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0016773513052612543, + "learning_rate": 1.9391197097485493e-06, + "loss": -0.0018, + "num_tokens": 128236397.0, + "reward": 4.576037406921387, + "reward_std": 0.05717041343450546, + "rewards/accuracy_reward/mean": 3.8260371685028076, + "rewards/accuracy_reward/std": 3.6741137504577637, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 762 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 660.0, + "completions/max_terminated_length": 660.0, + "completions/mean_length": 486.875, + "completions/mean_terminated_length": 486.875, + "completions/min_length": 310.0, + "completions/min_terminated_length": 310.0, + "epoch": 0.4610271903323263, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.030882038176059723, + "learning_rate": 1.9364837989803334e-06, + "loss": -0.0054, + "num_tokens": 128364565.0, + "reward": 5.188986301422119, + "reward_std": 0.9784811735153198, + "rewards/accuracy_reward/mean": 4.438986301422119, + "rewards/accuracy_reward/std": 3.672318458557129, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 763 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 759.0, + "completions/max_terminated_length": 759.0, + "completions/mean_length": 489.84375, + "completions/mean_terminated_length": 489.84375, + "completions/min_length": 325.0, + "completions/min_terminated_length": 325.0, + "epoch": 0.461631419939577, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.022949041798710823, + "learning_rate": 1.933846744031505e-06, + "loss": 0.0107, + "num_tokens": 128578027.0, + "reward": 4.205471992492676, + "reward_std": 0.7265978455543518, + "rewards/accuracy_reward/mean": 3.455471992492676, + "rewards/accuracy_reward/std": 3.7748780250549316, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 764 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 961.0, + "completions/mean_length": 557.984375, + "completions/mean_terminated_length": 534.3333740234375, + "completions/min_length": 339.0, + "completions/min_terminated_length": 339.0, + "epoch": 0.4622356495468278, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04408833757042885, + "learning_rate": 1.9312085554341332e-06, + "loss": -0.0213, + "num_tokens": 128719530.0, + "reward": 6.658767223358154, + "reward_std": 1.7446448802947998, + "rewards/accuracy_reward/mean": 5.920485973358154, + "rewards/accuracy_reward/std": 2.9847700595855713, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 765 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1601.0, + "completions/max_terminated_length": 1601.0, + "completions/mean_length": 715.9375, + "completions/mean_terminated_length": 715.9375, + "completions/min_length": 302.0, + "completions/min_terminated_length": 302.0, + "epoch": 0.46283987915407854, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.033212386071681976, + "learning_rate": 1.928569243724815e-06, + "loss": -0.0051, + "num_tokens": 128882342.0, + "reward": 1.3931422233581543, + "reward_std": 1.037135362625122, + "rewards/accuracy_reward/mean": 0.6548609137535095, + "rewards/accuracy_reward/std": 2.16731333732605, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 766 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 735.0, + "completions/max_terminated_length": 735.0, + "completions/mean_length": 491.921875, + "completions/mean_terminated_length": 491.921875, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 0.4634441087613293, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03580579161643982, + "learning_rate": 1.9259288194446327e-06, + "loss": 0.017, + "num_tokens": 129070129.0, + "reward": 5.704895973205566, + "reward_std": 1.7876882553100586, + "rewards/accuracy_reward/mean": 4.954895973205566, + "rewards/accuracy_reward/std": 3.526793956756592, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 767 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1147.0, + "completions/max_terminated_length": 1147.0, + "completions/mean_length": 566.203125, + "completions/mean_terminated_length": 566.203125, + "completions/min_length": 343.0, + "completions/min_terminated_length": 343.0, + "epoch": 0.46404833836858006, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.02335280552506447, + "learning_rate": 1.9232872931391114e-06, + "loss": 0.0061, + "num_tokens": 129239566.0, + "reward": 4.5764312744140625, + "reward_std": 0.49276450276374817, + "rewards/accuracy_reward/mean": 3.8264312744140625, + "rewards/accuracy_reward/std": 3.780052661895752, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 768 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1607.0, + "completions/max_terminated_length": 1607.0, + "completions/mean_length": 627.90625, + "completions/mean_terminated_length": 627.90625, + "completions/min_length": 323.0, + "completions/min_terminated_length": 323.0, + "epoch": 0.4646525679758308, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.029818322509527206, + "learning_rate": 1.920644675358179e-06, + "loss": 0.0146, + "num_tokens": 129398344.0, + "reward": 3.111370325088501, + "reward_std": 1.4748573303222656, + "rewards/accuracy_reward/mean": 2.361370325088501, + "rewards/accuracy_reward/std": 3.3778750896453857, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 769 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1163.0, + "completions/mean_length": 622.65625, + "completions/mean_terminated_length": 600.0317993164062, + "completions/min_length": 277.0, + "completions/min_terminated_length": 277.0, + "epoch": 0.4652567975830816, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.04222160205245018, + "learning_rate": 1.918000976656121e-06, + "loss": 0.033, + "num_tokens": 129572066.0, + "reward": 3.0873734951019287, + "reward_std": 1.8270856142044067, + "rewards/accuracy_reward/mean": 2.3412795066833496, + "rewards/accuracy_reward/std": 3.449958086013794, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 770 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1211.0, + "completions/max_terminated_length": 1211.0, + "completions/mean_length": 738.328125, + "completions/mean_terminated_length": 738.328125, + "completions/min_length": 391.0, + "completions/min_terminated_length": 391.0, + "epoch": 0.46586102719033234, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04878152161836624, + "learning_rate": 1.9153562075915415e-06, + "loss": 0.0026, + "num_tokens": 129730247.0, + "reward": 3.0375797748565674, + "reward_std": 2.437366247177124, + "rewards/accuracy_reward/mean": 2.2875797748565674, + "rewards/accuracy_reward/std": 3.2786834239959717, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 771 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1088.0, + "completions/max_terminated_length": 1088.0, + "completions/mean_length": 643.984375, + "completions/mean_terminated_length": 643.984375, + "completions/min_length": 414.0, + "completions/min_terminated_length": 414.0, + "epoch": 0.4664652567975831, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04874156042933464, + "learning_rate": 1.9127103787273176e-06, + "loss": 0.0033, + "num_tokens": 129889558.0, + "reward": 5.732639312744141, + "reward_std": 2.376161813735962, + "rewards/accuracy_reward/mean": 4.982639312744141, + "rewards/accuracy_reward/std": 3.4496359825134277, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 772 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1325.0, + "completions/max_terminated_length": 1325.0, + "completions/mean_length": 690.703125, + "completions/mean_terminated_length": 690.703125, + "completions/min_length": 329.0, + "completions/min_terminated_length": 329.0, + "epoch": 0.4670694864048338, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.043474409729242325, + "learning_rate": 1.9100635006305613e-06, + "loss": 0.0044, + "num_tokens": 130026131.0, + "reward": 3.5239548683166504, + "reward_std": 1.737228512763977, + "rewards/accuracy_reward/mean": 2.7739548683166504, + "rewards/accuracy_reward/std": 3.678656816482544, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 773 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1004.0, + "completions/max_terminated_length": 1004.0, + "completions/mean_length": 491.203125, + "completions/mean_terminated_length": 491.203125, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "epoch": 0.4676737160120846, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.046511825174093246, + "learning_rate": 1.907415583872574e-06, + "loss": 0.0453, + "num_tokens": 130201424.0, + "reward": 4.930031776428223, + "reward_std": 1.7894848585128784, + "rewards/accuracy_reward/mean": 4.180031776428223, + "rewards/accuracy_reward/std": 3.722571611404419, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 774 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1171.0, + "completions/max_terminated_length": 1171.0, + "completions/mean_length": 600.109375, + "completions/mean_terminated_length": 600.109375, + "completions/min_length": 362.0, + "completions/min_terminated_length": 362.0, + "epoch": 0.46827794561933533, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.05703055486083031, + "learning_rate": 1.9047666390288048e-06, + "loss": -0.0083, + "num_tokens": 130379623.0, + "reward": 3.616147041320801, + "reward_std": 2.519304037094116, + "rewards/accuracy_reward/mean": 2.866147041320801, + "rewards/accuracy_reward/std": 3.5801241397857666, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 775 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1111.0, + "completions/max_terminated_length": 1111.0, + "completions/mean_length": 576.09375, + "completions/mean_terminated_length": 576.09375, + "completions/min_length": 358.0, + "completions/min_terminated_length": 358.0, + "epoch": 0.4688821752265861, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04796629026532173, + "learning_rate": 1.9021166766788102e-06, + "loss": 0.0182, + "num_tokens": 130614877.0, + "reward": 4.219054698944092, + "reward_std": 1.9144253730773926, + "rewards/accuracy_reward/mean": 3.472960948944092, + "rewards/accuracy_reward/std": 3.7603261470794678, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 776 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 749.0, + "completions/max_terminated_length": 749.0, + "completions/mean_length": 472.3125, + "completions/mean_terminated_length": 472.3125, + "completions/min_length": 275.0, + "completions/min_terminated_length": 275.0, + "epoch": 0.46948640483383686, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.021675636991858482, + "learning_rate": 1.8994657074062095e-06, + "loss": 0.0038, + "num_tokens": 130747073.0, + "reward": 6.56119441986084, + "reward_std": 0.6594287157058716, + "rewards/accuracy_reward/mean": 5.81119441986084, + "rewards/accuracy_reward/std": 3.044766426086426, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 777 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 784.0, + "completions/max_terminated_length": 784.0, + "completions/mean_length": 518.40625, + "completions/mean_terminated_length": 518.40625, + "completions/min_length": 307.0, + "completions/min_terminated_length": 307.0, + "epoch": 0.4700906344410876, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04565272480249405, + "learning_rate": 1.8968137417986436e-06, + "loss": 0.0097, + "num_tokens": 130890347.0, + "reward": 3.550175189971924, + "reward_std": 2.3371968269348145, + "rewards/accuracy_reward/mean": 2.800175189971924, + "rewards/accuracy_reward/std": 3.630014657974243, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 778 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 845.0, + "completions/max_terminated_length": 845.0, + "completions/mean_length": 458.1875, + "completions/mean_terminated_length": 458.1875, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "epoch": 0.4706948640483384, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.032743390649557114, + "learning_rate": 1.8941607904477324e-06, + "loss": 0.0106, + "num_tokens": 131007975.0, + "reward": 6.168802261352539, + "reward_std": 1.1260358095169067, + "rewards/accuracy_reward/mean": 5.430521011352539, + "rewards/accuracy_reward/std": 3.33003306388855, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 779 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 815.0, + "completions/max_terminated_length": 815.0, + "completions/mean_length": 548.3125, + "completions/mean_terminated_length": 548.3125, + "completions/min_length": 306.0, + "completions/min_terminated_length": 306.0, + "epoch": 0.47129909365558914, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.05086996406316757, + "learning_rate": 1.8915068639490344e-06, + "loss": -0.0153, + "num_tokens": 131180603.0, + "reward": 4.587976455688477, + "reward_std": 2.4226438999176025, + "rewards/accuracy_reward/mean": 3.8379764556884766, + "rewards/accuracy_reward/std": 3.74933123588562, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 780 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1102.0, + "completions/max_terminated_length": 1102.0, + "completions/mean_length": 642.421875, + "completions/mean_terminated_length": 642.421875, + "completions/min_length": 426.0, + "completions/min_terminated_length": 426.0, + "epoch": 0.4719033232628399, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.030137859284877777, + "learning_rate": 1.888851972902001e-06, + "loss": 0.0092, + "num_tokens": 131319286.0, + "reward": 3.065234422683716, + "reward_std": 0.9072188138961792, + "rewards/accuracy_reward/mean": 2.315234422683716, + "rewards/accuracy_reward/std": 3.4974405765533447, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 781 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1089.0, + "completions/max_terminated_length": 1089.0, + "completions/mean_length": 563.53125, + "completions/mean_terminated_length": 563.53125, + "completions/min_length": 329.0, + "completions/min_terminated_length": 329.0, + "epoch": 0.4725075528700906, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.030875688418745995, + "learning_rate": 1.8861961279099356e-06, + "loss": -0.0073, + "num_tokens": 131490552.0, + "reward": 2.930987596511841, + "reward_std": 0.7696576714515686, + "rewards/accuracy_reward/mean": 2.180987596511841, + "rewards/accuracy_reward/std": 3.419809103012085, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 782 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 906.0, + "completions/max_terminated_length": 906.0, + "completions/mean_length": 635.125, + "completions/mean_terminated_length": 635.125, + "completions/min_length": 408.0, + "completions/min_terminated_length": 408.0, + "epoch": 0.47311178247734137, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0008951526251621544, + "learning_rate": 1.8835393395799534e-06, + "loss": -0.0003, + "num_tokens": 131729248.0, + "reward": 2.6190531253814697, + "reward_std": 0.03093307837843895, + "rewards/accuracy_reward/mean": 1.8690531253814697, + "rewards/accuracy_reward/std": 3.2491302490234375, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 783 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 834.0, + "completions/max_terminated_length": 834.0, + "completions/mean_length": 522.15625, + "completions/mean_terminated_length": 522.15625, + "completions/min_length": 337.0, + "completions/min_terminated_length": 337.0, + "epoch": 0.47371601208459213, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03554685786366463, + "learning_rate": 1.8808816185229356e-06, + "loss": 0.0116, + "num_tokens": 131924826.0, + "reward": 4.889540672302246, + "reward_std": 1.4126888513565063, + "rewards/accuracy_reward/mean": 4.139540672302246, + "rewards/accuracy_reward/std": 3.7139530181884766, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 784 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 749.0, + "completions/max_terminated_length": 749.0, + "completions/mean_length": 457.4375, + "completions/mean_terminated_length": 457.4375, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 0.4743202416918429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03438882529735565, + "learning_rate": 1.8782229753534894e-06, + "loss": 0.015, + "num_tokens": 132054902.0, + "reward": 6.981789588928223, + "reward_std": 1.5015552043914795, + "rewards/accuracy_reward/mean": 6.231789588928223, + "rewards/accuracy_reward/std": 2.7496931552886963, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 785 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 820.0, + "completions/max_terminated_length": 820.0, + "completions/mean_length": 503.375, + "completions/mean_terminated_length": 503.375, + "completions/min_length": 286.0, + "completions/min_terminated_length": 286.0, + "epoch": 0.47492447129909365, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04490378871560097, + "learning_rate": 1.8755634206899036e-06, + "loss": 0.0034, + "num_tokens": 132281326.0, + "reward": 5.763540744781494, + "reward_std": 1.677807092666626, + "rewards/accuracy_reward/mean": 5.013540267944336, + "rewards/accuracy_reward/std": 3.5852715969085693, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 786 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1511.0, + "completions/max_terminated_length": 1511.0, + "completions/mean_length": 706.671875, + "completions/mean_terminated_length": 706.671875, + "completions/min_length": 389.0, + "completions/min_terminated_length": 389.0, + "epoch": 0.4755287009063444, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.025427592918276787, + "learning_rate": 1.8729029651541091e-06, + "loss": -0.0158, + "num_tokens": 132518841.0, + "reward": 4.224914073944092, + "reward_std": 1.0657188892364502, + "rewards/accuracy_reward/mean": 3.474914073944092, + "rewards/accuracy_reward/std": 4.022206783294678, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 787 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 895.0, + "completions/max_terminated_length": 895.0, + "completions/mean_length": 521.546875, + "completions/mean_terminated_length": 521.546875, + "completions/min_length": 365.0, + "completions/min_terminated_length": 365.0, + "epoch": 0.4761329305135952, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.049908265471458435, + "learning_rate": 1.8702416193716342e-06, + "loss": -0.0042, + "num_tokens": 132746236.0, + "reward": 4.846438884735107, + "reward_std": 2.2417895793914795, + "rewards/accuracy_reward/mean": 4.096439361572266, + "rewards/accuracy_reward/std": 3.7340164184570312, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 788 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 911.0, + "completions/mean_length": 584.859375, + "completions/mean_terminated_length": 537.6612548828125, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 0.47673716012084594, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03821206092834473, + "learning_rate": 1.8675793939715616e-06, + "loss": -0.0034, + "num_tokens": 132916051.0, + "reward": 5.260810852050781, + "reward_std": 1.0851414203643799, + "rewards/accuracy_reward/mean": 4.534248352050781, + "rewards/accuracy_reward/std": 3.7116520404815674, + "rewards/tag_count_reward/mean": 0.7265625, + "rewards/tag_count_reward/std": 0.13152606785297394, + "step": 789 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1137.0, + "completions/max_terminated_length": 1137.0, + "completions/mean_length": 599.953125, + "completions/mean_terminated_length": 599.953125, + "completions/min_length": 330.0, + "completions/min_terminated_length": 330.0, + "epoch": 0.4773413897280967, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04582447558641434, + "learning_rate": 1.864916299586489e-06, + "loss": 0.0133, + "num_tokens": 133064992.0, + "reward": 3.3902344703674316, + "reward_std": 1.8052178621292114, + "rewards/accuracy_reward/mean": 2.6402344703674316, + "rewards/accuracy_reward/std": 3.6511125564575195, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 790 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1076.0, + "completions/mean_length": 725.953125, + "completions/mean_terminated_length": 704.9683227539062, + "completions/min_length": 468.0, + "completions/min_terminated_length": 468.0, + "epoch": 0.4779456193353474, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04019676521420479, + "learning_rate": 1.8622523468524828e-06, + "loss": 0.0053, + "num_tokens": 133229053.0, + "reward": 4.495401382446289, + "reward_std": 2.1700265407562256, + "rewards/accuracy_reward/mean": 3.757120132446289, + "rewards/accuracy_reward/std": 3.6960525512695312, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 791 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 954.0, + "completions/max_terminated_length": 954.0, + "completions/mean_length": 611.5625, + "completions/mean_terminated_length": 611.5625, + "completions/min_length": 410.0, + "completions/min_terminated_length": 410.0, + "epoch": 0.47854984894259817, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.003169614588841796, + "learning_rate": 1.8595875464090389e-06, + "loss": -0.0006, + "num_tokens": 133471649.0, + "reward": 0.8342468738555908, + "reward_std": 0.09040440618991852, + "rewards/accuracy_reward/mean": 0.08424687385559082, + "rewards/accuracy_reward/std": 0.22838561236858368, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 792 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 855.0, + "completions/max_terminated_length": 855.0, + "completions/mean_length": 564.8125, + "completions/mean_terminated_length": 564.8125, + "completions/min_length": 391.0, + "completions/min_terminated_length": 391.0, + "epoch": 0.4791540785498489, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03866303712129593, + "learning_rate": 1.8569219088990376e-06, + "loss": 0.022, + "num_tokens": 133625813.0, + "reward": 5.955423355102539, + "reward_std": 1.495927333831787, + "rewards/accuracy_reward/mean": 5.205423355102539, + "rewards/accuracy_reward/std": 3.660696506500244, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 793 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1406.0, + "completions/mean_length": 746.84375, + "completions/mean_terminated_length": 726.1904907226562, + "completions/min_length": 402.0, + "completions/min_terminated_length": 402.0, + "epoch": 0.4797583081570997, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.050092700868844986, + "learning_rate": 1.8542554449687045e-06, + "loss": -0.0248, + "num_tokens": 133823963.0, + "reward": 2.3445279598236084, + "reward_std": 1.936914324760437, + "rewards/accuracy_reward/mean": 1.5984344482421875, + "rewards/accuracy_reward/std": 2.9979569911956787, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 794 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1052.0, + "completions/max_terminated_length": 1052.0, + "completions/mean_length": 556.359375, + "completions/mean_terminated_length": 556.359375, + "completions/min_length": 338.0, + "completions/min_terminated_length": 338.0, + "epoch": 0.48036253776435045, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.036717694252729416, + "learning_rate": 1.8515881652675637e-06, + "loss": -0.0114, + "num_tokens": 133970050.0, + "reward": 6.502540588378906, + "reward_std": 1.6019220352172852, + "rewards/accuracy_reward/mean": 5.756446838378906, + "rewards/accuracy_reward/std": 3.0715959072113037, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 795 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 676.0, + "completions/max_terminated_length": 676.0, + "completions/mean_length": 480.421875, + "completions/mean_terminated_length": 480.421875, + "completions/min_length": 305.0, + "completions/min_terminated_length": 305.0, + "epoch": 0.4809667673716012, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0230408962816, + "learning_rate": 1.848920080448398e-06, + "loss": -0.004, + "num_tokens": 134148573.0, + "reward": 5.045223236083984, + "reward_std": 0.908596396446228, + "rewards/accuracy_reward/mean": 4.295223236083984, + "rewards/accuracy_reward/std": 3.698347806930542, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 796 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 762.0, + "completions/max_terminated_length": 762.0, + "completions/mean_length": 482.296875, + "completions/mean_terminated_length": 482.296875, + "completions/min_length": 271.0, + "completions/min_terminated_length": 271.0, + "epoch": 0.481570996978852, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.026639709249138832, + "learning_rate": 1.8462512011672055e-06, + "loss": -0.0103, + "num_tokens": 134270624.0, + "reward": 2.9644954204559326, + "reward_std": 0.7555176615715027, + "rewards/accuracy_reward/mean": 2.2144951820373535, + "rewards/accuracy_reward/std": 3.435016393661499, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 797 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1288.0, + "completions/max_terminated_length": 1288.0, + "completions/mean_length": 626.421875, + "completions/mean_terminated_length": 626.421875, + "completions/min_length": 338.0, + "completions/min_terminated_length": 338.0, + "epoch": 0.48217522658610273, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.037741925567388535, + "learning_rate": 1.843581538083159e-06, + "loss": 0.0171, + "num_tokens": 134459435.0, + "reward": 4.613154411315918, + "reward_std": 2.128770589828491, + "rewards/accuracy_reward/mean": 3.863154411315918, + "rewards/accuracy_reward/std": 3.7305023670196533, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 798 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1935.0, + "completions/max_terminated_length": 1935.0, + "completions/mean_length": 695.390625, + "completions/mean_terminated_length": 695.390625, + "completions/min_length": 380.0, + "completions/min_terminated_length": 380.0, + "epoch": 0.4827794561933535, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.017788581550121307, + "learning_rate": 1.8409111018585587e-06, + "loss": 0.0041, + "num_tokens": 134658116.0, + "reward": 6.536808013916016, + "reward_std": 0.4600452184677124, + "rewards/accuracy_reward/mean": 5.786808013916016, + "rewards/accuracy_reward/std": 3.0001628398895264, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 799 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1551.0, + "completions/mean_length": 658.84375, + "completions/mean_terminated_length": 590.5245361328125, + "completions/min_length": 315.0, + "completions/min_terminated_length": 315.0, + "epoch": 0.48338368580060426, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.00256089074537158, + "learning_rate": 1.8382399031587952e-06, + "loss": -0.0269, + "num_tokens": 134862698.0, + "reward": 2.540104627609253, + "reward_std": 0.2611614465713501, + "rewards/accuracy_reward/mean": 1.8252609968185425, + "rewards/accuracy_reward/std": 3.3023152351379395, + "rewards/tag_count_reward/mean": 0.71484375, + "rewards/tag_count_reward/std": 0.1597815304994583, + "step": 800 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1394.0, + "completions/max_terminated_length": 1394.0, + "completions/mean_length": 624.984375, + "completions/mean_terminated_length": 624.984375, + "completions/min_length": 347.0, + "completions/min_terminated_length": 347.0, + "epoch": 0.48398791540785496, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.051799286156892776, + "learning_rate": 1.8355679526523035e-06, + "loss": 0.0004, + "num_tokens": 135074793.0, + "reward": 2.461937427520752, + "reward_std": 2.837730646133423, + "rewards/accuracy_reward/mean": 1.711937427520752, + "rewards/accuracy_reward/std": 3.0502030849456787, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 801 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 881.0, + "completions/max_terminated_length": 881.0, + "completions/mean_length": 489.734375, + "completions/mean_terminated_length": 489.734375, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "epoch": 0.4845921450151057, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.05020559951663017, + "learning_rate": 1.832895261010521e-06, + "loss": 0.022, + "num_tokens": 135204120.0, + "reward": 3.4081625938415527, + "reward_std": 2.4368677139282227, + "rewards/accuracy_reward/mean": 2.6581625938415527, + "rewards/accuracy_reward/std": 3.5772359371185303, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 802 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1381.0, + "completions/max_terminated_length": 1381.0, + "completions/mean_length": 657.796875, + "completions/mean_terminated_length": 657.796875, + "completions/min_length": 333.0, + "completions/min_terminated_length": 333.0, + "epoch": 0.4851963746223565, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.030149638652801514, + "learning_rate": 1.8302218389078451e-06, + "loss": 0.0083, + "num_tokens": 135395723.0, + "reward": 3.40691876411438, + "reward_std": 1.0165140628814697, + "rewards/accuracy_reward/mean": 2.656919002532959, + "rewards/accuracy_reward/std": 3.595900297164917, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 803 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 916.0, + "completions/max_terminated_length": 916.0, + "completions/mean_length": 656.046875, + "completions/mean_terminated_length": 656.046875, + "completions/min_length": 361.0, + "completions/min_terminated_length": 361.0, + "epoch": 0.48580060422960725, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.003484098007902503, + "learning_rate": 1.8275476970215906e-06, + "loss": -0.0015, + "num_tokens": 135557102.0, + "reward": 2.69340181350708, + "reward_std": 0.16348513960838318, + "rewards/accuracy_reward/mean": 1.943401575088501, + "rewards/accuracy_reward/std": 3.226918935775757, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 804 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 925.0, + "completions/max_terminated_length": 925.0, + "completions/mean_length": 614.953125, + "completions/mean_terminated_length": 614.953125, + "completions/min_length": 405.0, + "completions/min_terminated_length": 405.0, + "epoch": 0.486404833836858, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04568734019994736, + "learning_rate": 1.8248728460319478e-06, + "loss": 0.0079, + "num_tokens": 135714219.0, + "reward": 4.639148712158203, + "reward_std": 2.2154595851898193, + "rewards/accuracy_reward/mean": 3.889148235321045, + "rewards/accuracy_reward/std": 3.7234280109405518, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 805 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1729.0, + "completions/mean_length": 763.53125, + "completions/mean_terminated_length": 722.0967407226562, + "completions/min_length": 321.0, + "completions/min_terminated_length": 321.0, + "epoch": 0.48700906344410877, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.039834655821323395, + "learning_rate": 1.8221972966219372e-06, + "loss": -0.0652, + "num_tokens": 135883981.0, + "reward": 4.985022068023682, + "reward_std": 1.880674123764038, + "rewards/accuracy_reward/mean": 4.258459091186523, + "rewards/accuracy_reward/std": 3.7194602489471436, + "rewards/tag_count_reward/mean": 0.7265625, + "rewards/tag_count_reward/std": 0.13152606785297394, + "step": 806 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 767.0, + "completions/max_terminated_length": 767.0, + "completions/mean_length": 504.65625, + "completions/mean_terminated_length": 504.65625, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "epoch": 0.48761329305135953, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.029188496991991997, + "learning_rate": 1.8195210594773712e-06, + "loss": -0.0114, + "num_tokens": 136072535.0, + "reward": 7.376169681549072, + "reward_std": 0.9823890328407288, + "rewards/accuracy_reward/mean": 6.6261701583862305, + "rewards/accuracy_reward/std": 2.3409230709075928, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 807 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 853.0, + "completions/max_terminated_length": 853.0, + "completions/mean_length": 568.796875, + "completions/mean_terminated_length": 568.796875, + "completions/min_length": 329.0, + "completions/min_terminated_length": 329.0, + "epoch": 0.4882175226586103, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.01877441629767418, + "learning_rate": 1.816844145286807e-06, + "loss": -0.0014, + "num_tokens": 136248730.0, + "reward": 4.400476932525635, + "reward_std": 0.53583824634552, + "rewards/accuracy_reward/mean": 3.6504764556884766, + "rewards/accuracy_reward/std": 3.7064149379730225, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 808 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 804.0, + "completions/max_terminated_length": 804.0, + "completions/mean_length": 525.8125, + "completions/mean_terminated_length": 525.8125, + "completions/min_length": 320.0, + "completions/min_terminated_length": 320.0, + "epoch": 0.48882175226586105, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.03335683420300484, + "learning_rate": 1.8141665647415062e-06, + "loss": 0.0031, + "num_tokens": 136387182.0, + "reward": 1.6961359977722168, + "reward_std": 0.9481968283653259, + "rewards/accuracy_reward/mean": 0.946135938167572, + "rewards/accuracy_reward/std": 2.480565071105957, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 809 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1261.0, + "completions/max_terminated_length": 1261.0, + "completions/mean_length": 524.703125, + "completions/mean_terminated_length": 524.703125, + "completions/min_length": 258.0, + "completions/min_terminated_length": 258.0, + "epoch": 0.48942598187311176, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.029855089262127876, + "learning_rate": 1.8114883285353925e-06, + "loss": -0.0149, + "num_tokens": 136555899.0, + "reward": 6.257462501525879, + "reward_std": 1.1052496433258057, + "rewards/accuracy_reward/mean": 5.507462501525879, + "rewards/accuracy_reward/std": 3.1029133796691895, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 810 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1217.0, + "completions/max_terminated_length": 1217.0, + "completions/mean_length": 548.140625, + "completions/mean_terminated_length": 548.140625, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "epoch": 0.4900302114803625, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0412583127617836, + "learning_rate": 1.808809447365008e-06, + "loss": -0.0026, + "num_tokens": 136732292.0, + "reward": 4.1951422691345215, + "reward_std": 2.3825371265411377, + "rewards/accuracy_reward/mean": 3.4451422691345215, + "rewards/accuracy_reward/std": 3.777087926864624, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 811 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1371.0, + "completions/max_terminated_length": 1371.0, + "completions/mean_length": 611.90625, + "completions/mean_terminated_length": 611.90625, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 0.4906344410876133, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04785553365945816, + "learning_rate": 1.8061299319294694e-06, + "loss": 0.0261, + "num_tokens": 136901262.0, + "reward": 4.419078350067139, + "reward_std": 1.8275611400604248, + "rewards/accuracy_reward/mean": 3.6690783500671387, + "rewards/accuracy_reward/std": 3.647444486618042, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 812 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1346.0, + "completions/max_terminated_length": 1346.0, + "completions/mean_length": 747.9375, + "completions/mean_terminated_length": 747.9375, + "completions/min_length": 391.0, + "completions/min_terminated_length": 391.0, + "epoch": 0.49123867069486404, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.040127500891685486, + "learning_rate": 1.8034497929304284e-06, + "loss": -0.0143, + "num_tokens": 137095226.0, + "reward": 3.6759610176086426, + "reward_std": 2.3293776512145996, + "rewards/accuracy_reward/mean": 2.9259610176086426, + "rewards/accuracy_reward/std": 3.5450501441955566, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 813 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1322.0, + "completions/max_terminated_length": 1322.0, + "completions/mean_length": 628.84375, + "completions/mean_terminated_length": 628.84375, + "completions/min_length": 305.0, + "completions/min_terminated_length": 305.0, + "epoch": 0.4918429003021148, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.045798495411872864, + "learning_rate": 1.8007690410720266e-06, + "loss": 0.0592, + "num_tokens": 137308720.0, + "reward": 3.7635843753814697, + "reward_std": 1.4756417274475098, + "rewards/accuracy_reward/mean": 3.0135841369628906, + "rewards/accuracy_reward/std": 3.6039628982543945, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 814 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 893.0, + "completions/max_terminated_length": 893.0, + "completions/mean_length": 571.765625, + "completions/mean_terminated_length": 571.765625, + "completions/min_length": 348.0, + "completions/min_terminated_length": 348.0, + "epoch": 0.49244712990936557, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04975862428545952, + "learning_rate": 1.7980876870608527e-06, + "loss": 0.0006, + "num_tokens": 137498561.0, + "reward": 3.6332015991210938, + "reward_std": 2.7388031482696533, + "rewards/accuracy_reward/mean": 2.8832015991210938, + "rewards/accuracy_reward/std": 3.5565106868743896, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 815 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1865.0, + "completions/mean_length": 744.515625, + "completions/mean_terminated_length": 702.4677124023438, + "completions/min_length": 441.0, + "completions/min_terminated_length": 441.0, + "epoch": 0.4930513595166163, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.015759091824293137, + "learning_rate": 1.7954057416059002e-06, + "loss": -0.0208, + "num_tokens": 137739842.0, + "reward": 2.4599406719207764, + "reward_std": 0.7289025783538818, + "rewards/accuracy_reward/mean": 1.7333781719207764, + "rewards/accuracy_reward/std": 3.2006888389587402, + "rewards/tag_count_reward/mean": 0.7265625, + "rewards/tag_count_reward/std": 0.13152606785297394, + "step": 816 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 799.0, + "completions/mean_length": 597.515625, + "completions/mean_terminated_length": 574.4921264648438, + "completions/min_length": 364.0, + "completions/min_terminated_length": 364.0, + "epoch": 0.4936555891238671, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06375595182180405, + "learning_rate": 1.792723215418526e-06, + "loss": -0.0118, + "num_tokens": 137925683.0, + "reward": 2.6618499755859375, + "reward_std": 2.789512872695923, + "rewards/accuracy_reward/mean": 1.935287594795227, + "rewards/accuracy_reward/std": 3.336726188659668, + "rewards/tag_count_reward/mean": 0.7265625, + "rewards/tag_count_reward/std": 0.13152606785297394, + "step": 817 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 813.0, + "completions/max_terminated_length": 813.0, + "completions/mean_length": 473.640625, + "completions/mean_terminated_length": 473.640625, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "epoch": 0.49425981873111785, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05166256055235863, + "learning_rate": 1.790040119212405e-06, + "loss": 0.0435, + "num_tokens": 138099548.0, + "reward": 5.486888885498047, + "reward_std": 2.7450437545776367, + "rewards/accuracy_reward/mean": 4.736888885498047, + "rewards/accuracy_reward/std": 3.6278786659240723, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 818 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1025.0, + "completions/max_terminated_length": 1025.0, + "completions/mean_length": 574.125, + "completions/mean_terminated_length": 574.125, + "completions/min_length": 319.0, + "completions/min_terminated_length": 319.0, + "epoch": 0.49486404833836856, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03462408855557442, + "learning_rate": 1.7873564637034892e-06, + "loss": 0.0121, + "num_tokens": 138336404.0, + "reward": 3.462707042694092, + "reward_std": 1.6031231880187988, + "rewards/accuracy_reward/mean": 2.712707042694092, + "rewards/accuracy_reward/std": 3.644179344177246, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 819 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 808.0, + "completions/max_terminated_length": 808.0, + "completions/mean_length": 548.671875, + "completions/mean_terminated_length": 548.671875, + "completions/min_length": 322.0, + "completions/min_terminated_length": 322.0, + "epoch": 0.4954682779456193, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.046154141426086426, + "learning_rate": 1.7846722596099653e-06, + "loss": 0.002, + "num_tokens": 138524287.0, + "reward": 4.92900276184082, + "reward_std": 2.69381046295166, + "rewards/accuracy_reward/mean": 4.17900276184082, + "rewards/accuracy_reward/std": 3.754295587539673, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 820 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 835.0, + "completions/max_terminated_length": 835.0, + "completions/mean_length": 526.953125, + "completions/mean_terminated_length": 526.953125, + "completions/min_length": 324.0, + "completions/min_terminated_length": 324.0, + "epoch": 0.4960725075528701, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.058082036674022675, + "learning_rate": 1.7819875176522096e-06, + "loss": 0.0208, + "num_tokens": 138658172.0, + "reward": 5.2847700119018555, + "reward_std": 2.8578004837036133, + "rewards/accuracy_reward/mean": 4.534770488739014, + "rewards/accuracy_reward/std": 3.6595375537872314, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 821 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 833.0, + "completions/max_terminated_length": 833.0, + "completions/mean_length": 527.09375, + "completions/mean_terminated_length": 527.09375, + "completions/min_length": 304.0, + "completions/min_terminated_length": 304.0, + "epoch": 0.49667673716012084, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.053963784128427505, + "learning_rate": 1.779302248552747e-06, + "loss": -0.019, + "num_tokens": 138886018.0, + "reward": 4.818668842315674, + "reward_std": 2.3968000411987305, + "rewards/accuracy_reward/mean": 4.068668365478516, + "rewards/accuracy_reward/std": 3.7506179809570312, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 822 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 854.0, + "completions/max_terminated_length": 854.0, + "completions/mean_length": 545.171875, + "completions/mean_terminated_length": 545.171875, + "completions/min_length": 338.0, + "completions/min_terminated_length": 338.0, + "epoch": 0.4972809667673716, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0387086495757103, + "learning_rate": 1.7766164630362079e-06, + "loss": 0.0063, + "num_tokens": 139071997.0, + "reward": 3.287935972213745, + "reward_std": 2.06200909614563, + "rewards/accuracy_reward/mean": 2.537935972213745, + "rewards/accuracy_reward/std": 3.5697507858276367, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 823 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 763.0, + "completions/max_terminated_length": 763.0, + "completions/mean_length": 459.640625, + "completions/mean_terminated_length": 459.640625, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 0.49788519637462236, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.047514963895082474, + "learning_rate": 1.7739301718292848e-06, + "loss": 0.0038, + "num_tokens": 139215606.0, + "reward": 5.631655693054199, + "reward_std": 2.9677391052246094, + "rewards/accuracy_reward/mean": 4.881655216217041, + "rewards/accuracy_reward/std": 3.535134792327881, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 824 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 889.0, + "completions/max_terminated_length": 889.0, + "completions/mean_length": 597.296875, + "completions/mean_terminated_length": 597.296875, + "completions/min_length": 371.0, + "completions/min_terminated_length": 371.0, + "epoch": 0.4984894259818731, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04443963244557381, + "learning_rate": 1.7712433856606916e-06, + "loss": 0.0101, + "num_tokens": 139451193.0, + "reward": 3.997701406478882, + "reward_std": 2.3400487899780273, + "rewards/accuracy_reward/mean": 3.247701644897461, + "rewards/accuracy_reward/std": 3.7455523014068604, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 825 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 850.0, + "completions/max_terminated_length": 850.0, + "completions/mean_length": 534.15625, + "completions/mean_terminated_length": 534.15625, + "completions/min_length": 353.0, + "completions/min_terminated_length": 353.0, + "epoch": 0.4990936555891239, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.023805715143680573, + "learning_rate": 1.7685561152611155e-06, + "loss": 0.0028, + "num_tokens": 139643635.0, + "reward": 2.3505640029907227, + "reward_std": 0.8648081421852112, + "rewards/accuracy_reward/mean": 1.6005640029907227, + "rewards/accuracy_reward/std": 2.9985759258270264, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 826 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 770.0, + "completions/max_terminated_length": 770.0, + "completions/mean_length": 451.59375, + "completions/mean_terminated_length": 451.59375, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "epoch": 0.49969788519637465, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04527535289525986, + "learning_rate": 1.7658683713631817e-06, + "loss": -0.0253, + "num_tokens": 139805129.0, + "reward": 4.748259544372559, + "reward_std": 1.736411690711975, + "rewards/accuracy_reward/mean": 3.9982590675354004, + "rewards/accuracy_reward/std": 3.807875871658325, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 827 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 874.0, + "completions/max_terminated_length": 874.0, + "completions/mean_length": 530.515625, + "completions/mean_terminated_length": 530.515625, + "completions/min_length": 324.0, + "completions/min_terminated_length": 324.0, + "epoch": 0.5003021148036254, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.024954557418823242, + "learning_rate": 1.7631801647014034e-06, + "loss": 0.0031, + "num_tokens": 140027066.0, + "reward": 5.980739116668701, + "reward_std": 1.0682170391082764, + "rewards/accuracy_reward/mean": 5.230739116668701, + "rewards/accuracy_reward/std": 3.3996894359588623, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 828 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1278.0, + "completions/max_terminated_length": 1278.0, + "completions/mean_length": 578.15625, + "completions/mean_terminated_length": 578.15625, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, + "epoch": 0.5009063444108761, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05411255732178688, + "learning_rate": 1.7604915060121435e-06, + "loss": 0.0077, + "num_tokens": 140181732.0, + "reward": 5.634032726287842, + "reward_std": 3.1226797103881836, + "rewards/accuracy_reward/mean": 4.884032726287842, + "rewards/accuracy_reward/std": 3.562819004058838, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 829 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 818.0, + "completions/max_terminated_length": 818.0, + "completions/mean_length": 608.015625, + "completions/mean_terminated_length": 608.015625, + "completions/min_length": 372.0, + "completions/min_terminated_length": 372.0, + "epoch": 0.5015105740181269, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04782368987798691, + "learning_rate": 1.7578024060335706e-06, + "loss": -0.0029, + "num_tokens": 140349605.0, + "reward": 6.188547134399414, + "reward_std": 2.638317823410034, + "rewards/accuracy_reward/mean": 5.438547134399414, + "rewards/accuracy_reward/std": 3.261870861053467, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 830 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 944.0, + "completions/mean_length": 587.171875, + "completions/mean_terminated_length": 563.984130859375, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "epoch": 0.5021148036253776, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.03631550073623657, + "learning_rate": 1.755112875505614e-06, + "loss": -0.0149, + "num_tokens": 140498288.0, + "reward": 2.7037765979766846, + "reward_std": 1.408288598060608, + "rewards/accuracy_reward/mean": 1.9654953479766846, + "rewards/accuracy_reward/std": 3.3319802284240723, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 831 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1149.0, + "completions/max_terminated_length": 1149.0, + "completions/mean_length": 625.03125, + "completions/mean_terminated_length": 625.03125, + "completions/min_length": 378.0, + "completions/min_terminated_length": 378.0, + "epoch": 0.5027190332326285, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.015825651586055756, + "learning_rate": 1.7524229251699245e-06, + "loss": 0.0002, + "num_tokens": 140635970.0, + "reward": 4.708549499511719, + "reward_std": 0.4808385372161865, + "rewards/accuracy_reward/mean": 3.958549976348877, + "rewards/accuracy_reward/std": 3.6475727558135986, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 832 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 783.0, + "completions/max_terminated_length": 783.0, + "completions/mean_length": 512.984375, + "completions/mean_terminated_length": 512.984375, + "completions/min_length": 316.0, + "completions/min_terminated_length": 316.0, + "epoch": 0.5033232628398792, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.02685452066361904, + "learning_rate": 1.749732565769828e-06, + "loss": 0.0028, + "num_tokens": 140819169.0, + "reward": 4.662587642669678, + "reward_std": 1.0731326341629028, + "rewards/accuracy_reward/mean": 3.9125876426696777, + "rewards/accuracy_reward/std": 3.6664955615997314, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 833 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1067.0, + "completions/max_terminated_length": 1067.0, + "completions/mean_length": 707.640625, + "completions/mean_terminated_length": 707.640625, + "completions/min_length": 394.0, + "completions/min_terminated_length": 394.0, + "epoch": 0.5039274924471299, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.026020193472504616, + "learning_rate": 1.7470418080502856e-06, + "loss": -0.0182, + "num_tokens": 140979450.0, + "reward": 2.9474422931671143, + "reward_std": 0.8224215507507324, + "rewards/accuracy_reward/mean": 2.1974422931671143, + "rewards/accuracy_reward/std": 3.4452950954437256, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 834 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1122.0, + "completions/max_terminated_length": 1122.0, + "completions/mean_length": 628.140625, + "completions/mean_terminated_length": 628.140625, + "completions/min_length": 409.0, + "completions/min_terminated_length": 409.0, + "epoch": 0.5045317220543807, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.049275387078523636, + "learning_rate": 1.7443506627578482e-06, + "loss": 0.002, + "num_tokens": 141145043.0, + "reward": 5.359977722167969, + "reward_std": 2.3530564308166504, + "rewards/accuracy_reward/mean": 4.609978199005127, + "rewards/accuracy_reward/std": 3.8425209522247314, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 835 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 864.0, + "completions/max_terminated_length": 864.0, + "completions/mean_length": 517.0625, + "completions/mean_terminated_length": 517.0625, + "completions/min_length": 361.0, + "completions/min_terminated_length": 361.0, + "epoch": 0.5051359516616314, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03496721759438515, + "learning_rate": 1.7416591406406144e-06, + "loss": -0.0046, + "num_tokens": 141311495.0, + "reward": 7.287557125091553, + "reward_std": 1.6245462894439697, + "rewards/accuracy_reward/mean": 6.537557601928711, + "rewards/accuracy_reward/std": 2.4057857990264893, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 836 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 965.0, + "completions/max_terminated_length": 965.0, + "completions/mean_length": 688.609375, + "completions/mean_terminated_length": 688.609375, + "completions/min_length": 319.0, + "completions/min_terminated_length": 319.0, + "epoch": 0.5057401812688822, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04174799472093582, + "learning_rate": 1.7389672524481895e-06, + "loss": 0.0048, + "num_tokens": 141473454.0, + "reward": 1.8737328052520752, + "reward_std": 1.6543887853622437, + "rewards/accuracy_reward/mean": 1.1237328052520752, + "rewards/accuracy_reward/std": 2.429069757461548, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 837 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 896.0, + "completions/max_terminated_length": 896.0, + "completions/mean_length": 530.1875, + "completions/mean_terminated_length": 530.1875, + "completions/min_length": 308.0, + "completions/min_terminated_length": 308.0, + "epoch": 0.5063444108761329, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03153300657868385, + "learning_rate": 1.7362750089316386e-06, + "loss": 0.0062, + "num_tokens": 141658122.0, + "reward": 4.511375427246094, + "reward_std": 1.3335294723510742, + "rewards/accuracy_reward/mean": 3.7613749504089355, + "rewards/accuracy_reward/std": 3.7221970558166504, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 838 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1209.0, + "completions/max_terminated_length": 1209.0, + "completions/mean_length": 481.921875, + "completions/mean_terminated_length": 481.921875, + "completions/min_length": 311.0, + "completions/min_terminated_length": 311.0, + "epoch": 0.5069486404833837, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.034892696887254715, + "learning_rate": 1.7335824208434468e-06, + "loss": -0.0072, + "num_tokens": 141793941.0, + "reward": 5.624143600463867, + "reward_std": 0.9431484937667847, + "rewards/accuracy_reward/mean": 4.874143600463867, + "rewards/accuracy_reward/std": 3.5559353828430176, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 839 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 849.0, + "completions/mean_length": 574.828125, + "completions/mean_terminated_length": 551.4444580078125, + "completions/min_length": 288.0, + "completions/min_terminated_length": 288.0, + "epoch": 0.5075528700906344, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04559945687651634, + "learning_rate": 1.7308894989374766e-06, + "loss": 0.0091, + "num_tokens": 142049482.0, + "reward": 1.9998688697814941, + "reward_std": 1.7911231517791748, + "rewards/accuracy_reward/mean": 1.2615875005722046, + "rewards/accuracy_reward/std": 2.821880578994751, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 840 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 937.0, + "completions/max_terminated_length": 937.0, + "completions/mean_length": 652.328125, + "completions/mean_terminated_length": 652.328125, + "completions/min_length": 340.0, + "completions/min_terminated_length": 340.0, + "epoch": 0.5081570996978853, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.018246199935674667, + "learning_rate": 1.7281962539689226e-06, + "loss": -0.0002, + "num_tokens": 142298975.0, + "reward": 2.468618869781494, + "reward_std": 0.5916566848754883, + "rewards/accuracy_reward/mean": 1.7186188697814941, + "rewards/accuracy_reward/std": 3.209890604019165, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 841 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 810.0, + "completions/max_terminated_length": 810.0, + "completions/mean_length": 465.859375, + "completions/mean_terminated_length": 465.859375, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "epoch": 0.508761329305136, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.038035713136196136, + "learning_rate": 1.7255026966942694e-06, + "loss": 0.0144, + "num_tokens": 142477206.0, + "reward": 6.970606327056885, + "reward_std": 1.5315990447998047, + "rewards/accuracy_reward/mean": 6.220606803894043, + "rewards/accuracy_reward/std": 2.7452027797698975, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 842 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1727.0, + "completions/mean_length": 875.125, + "completions/mean_terminated_length": 856.5079956054688, + "completions/min_length": 502.0, + "completions/min_terminated_length": 502.0, + "epoch": 0.5093655589123867, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04518472030758858, + "learning_rate": 1.7228088378712486e-06, + "loss": 0.0087, + "num_tokens": 142638190.0, + "reward": 4.16067361831665, + "reward_std": 1.0165960788726807, + "rewards/accuracy_reward/mean": 3.4223923683166504, + "rewards/accuracy_reward/std": 3.5977542400360107, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 843 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 808.0, + "completions/max_terminated_length": 808.0, + "completions/mean_length": 553.359375, + "completions/mean_terminated_length": 553.359375, + "completions/min_length": 280.0, + "completions/min_terminated_length": 280.0, + "epoch": 0.5099697885196375, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.013598470017313957, + "learning_rate": 1.720114688258798e-06, + "loss": -0.004, + "num_tokens": 142874949.0, + "reward": 2.523698568344116, + "reward_std": 0.5337299108505249, + "rewards/accuracy_reward/mean": 1.7736984491348267, + "rewards/accuracy_reward/std": 3.1741695404052734, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 844 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 868.0, + "completions/max_terminated_length": 868.0, + "completions/mean_length": 503.59375, + "completions/mean_terminated_length": 503.59375, + "completions/min_length": 337.0, + "completions/min_terminated_length": 337.0, + "epoch": 0.5105740181268882, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.025693925097584724, + "learning_rate": 1.7174202586170153e-06, + "loss": 0.0031, + "num_tokens": 142993819.0, + "reward": 7.881287574768066, + "reward_std": 1.0035114288330078, + "rewards/accuracy_reward/mean": 7.131287574768066, + "rewards/accuracy_reward/std": 1.3869054317474365, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 845 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 839.0, + "completions/max_terminated_length": 839.0, + "completions/mean_length": 451.078125, + "completions/mean_terminated_length": 451.078125, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.511178247734139, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04254954308271408, + "learning_rate": 1.7147255597071162e-06, + "loss": -0.0208, + "num_tokens": 143168192.0, + "reward": 5.7121076583862305, + "reward_std": 2.0878541469573975, + "rewards/accuracy_reward/mean": 4.9621076583862305, + "rewards/accuracy_reward/std": 3.5157644748687744, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 846 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 791.0, + "completions/max_terminated_length": 791.0, + "completions/mean_length": 527.359375, + "completions/mean_terminated_length": 527.359375, + "completions/min_length": 365.0, + "completions/min_terminated_length": 365.0, + "epoch": 0.5117824773413897, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.034504685550928116, + "learning_rate": 1.712030602291393e-06, + "loss": -0.0002, + "num_tokens": 143286535.0, + "reward": 3.321662425994873, + "reward_std": 1.763701319694519, + "rewards/accuracy_reward/mean": 2.571662425994873, + "rewards/accuracy_reward/std": 3.552624225616455, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 847 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 658.0, + "completions/max_terminated_length": 658.0, + "completions/mean_length": 442.25, + "completions/mean_terminated_length": 442.25, + "completions/min_length": 298.0, + "completions/min_terminated_length": 298.0, + "epoch": 0.5123867069486405, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03870110586285591, + "learning_rate": 1.7093353971331706e-06, + "loss": 0.0263, + "num_tokens": 143433703.0, + "reward": 7.313848495483398, + "reward_std": 1.96487557888031, + "rewards/accuracy_reward/mean": 6.563848495483398, + "rewards/accuracy_reward/std": 2.3727502822875977, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 848 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 985.0, + "completions/max_terminated_length": 985.0, + "completions/mean_length": 559.78125, + "completions/mean_terminated_length": 559.78125, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 0.5129909365558912, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03238805755972862, + "learning_rate": 1.7066399549967617e-06, + "loss": -0.0109, + "num_tokens": 143589577.0, + "reward": 4.509550094604492, + "reward_std": 1.8597521781921387, + "rewards/accuracy_reward/mean": 3.759549856185913, + "rewards/accuracy_reward/std": 3.6468493938446045, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 849 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 788.0, + "completions/max_terminated_length": 788.0, + "completions/mean_length": 550.15625, + "completions/mean_terminated_length": 550.15625, + "completions/min_length": 361.0, + "completions/min_terminated_length": 361.0, + "epoch": 0.513595166163142, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03167205676436424, + "learning_rate": 1.703944286647427e-06, + "loss": -0.0032, + "num_tokens": 143778243.0, + "reward": 6.3336358070373535, + "reward_std": 1.2905892133712769, + "rewards/accuracy_reward/mean": 5.5836358070373535, + "rewards/accuracy_reward/std": 3.249335289001465, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 850 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1380.0, + "completions/max_terminated_length": 1380.0, + "completions/mean_length": 668.796875, + "completions/mean_terminated_length": 668.796875, + "completions/min_length": 385.0, + "completions/min_terminated_length": 385.0, + "epoch": 0.5141993957703928, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0413055382668972, + "learning_rate": 1.7012484028513299e-06, + "loss": 0.0096, + "num_tokens": 143919158.0, + "reward": 2.0304718017578125, + "reward_std": 1.7817490100860596, + "rewards/accuracy_reward/mean": 1.2804718017578125, + "rewards/accuracy_reward/std": 2.8243794441223145, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 851 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1140.0, + "completions/max_terminated_length": 1140.0, + "completions/mean_length": 696.796875, + "completions/mean_terminated_length": 696.796875, + "completions/min_length": 423.0, + "completions/min_terminated_length": 423.0, + "epoch": 0.5148036253776435, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0016037452733144164, + "learning_rate": 1.6985523143754952e-06, + "loss": -0.0006, + "num_tokens": 144080297.0, + "reward": 6.356594085693359, + "reward_std": 0.06347799301147461, + "rewards/accuracy_reward/mean": 5.606594085693359, + "rewards/accuracy_reward/std": 3.2287955284118652, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 852 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 856.0, + "completions/max_terminated_length": 856.0, + "completions/mean_length": 528.328125, + "completions/mean_terminated_length": 528.328125, + "completions/min_length": 285.0, + "completions/min_terminated_length": 285.0, + "epoch": 0.5154078549848943, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.02845207415521145, + "learning_rate": 1.6958560319877634e-06, + "loss": 0.004, + "num_tokens": 144213150.0, + "reward": 2.9862451553344727, + "reward_std": 1.1719651222229004, + "rewards/accuracy_reward/mean": 2.2362453937530518, + "rewards/accuracy_reward/std": 3.3940279483795166, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 853 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 700.0, + "completions/max_terminated_length": 700.0, + "completions/mean_length": 463.84375, + "completions/mean_terminated_length": 463.84375, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "epoch": 0.516012084592145, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04698057845234871, + "learning_rate": 1.6931595664567509e-06, + "loss": 0.0093, + "num_tokens": 144506660.0, + "reward": 7.476476669311523, + "reward_std": 1.8497748374938965, + "rewards/accuracy_reward/mean": 6.726476192474365, + "rewards/accuracy_reward/std": 2.1810173988342285, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 854 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1032.0, + "completions/max_terminated_length": 1032.0, + "completions/mean_length": 662.0625, + "completions/mean_terminated_length": 662.0625, + "completions/min_length": 409.0, + "completions/min_terminated_length": 409.0, + "epoch": 0.5166163141993958, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.039677780121564865, + "learning_rate": 1.690462928551806e-06, + "loss": -0.005, + "num_tokens": 144678296.0, + "reward": 4.078343868255615, + "reward_std": 1.7702250480651855, + "rewards/accuracy_reward/mean": 3.3283438682556152, + "rewards/accuracy_reward/std": 3.6611006259918213, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 855 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1149.0, + "completions/max_terminated_length": 1149.0, + "completions/mean_length": 648.90625, + "completions/mean_terminated_length": 648.90625, + "completions/min_length": 276.0, + "completions/min_terminated_length": 276.0, + "epoch": 0.5172205438066465, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04365631937980652, + "learning_rate": 1.6877661290429632e-06, + "loss": 0.0043, + "num_tokens": 144832370.0, + "reward": 3.239060878753662, + "reward_std": 1.7840477228164673, + "rewards/accuracy_reward/mean": 2.489060878753662, + "rewards/accuracy_reward/std": 3.641780376434326, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 856 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1528.0, + "completions/max_terminated_length": 1528.0, + "completions/mean_length": 583.21875, + "completions/mean_terminated_length": 583.21875, + "completions/min_length": 346.0, + "completions/min_terminated_length": 346.0, + "epoch": 0.5178247734138973, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.040012795478105545, + "learning_rate": 1.6850691787009058e-06, + "loss": 0.0002, + "num_tokens": 145065328.0, + "reward": 3.1592841148376465, + "reward_std": 1.8437130451202393, + "rewards/accuracy_reward/mean": 2.4092845916748047, + "rewards/accuracy_reward/std": 3.427712917327881, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 857 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1126.0, + "completions/max_terminated_length": 1126.0, + "completions/mean_length": 718.34375, + "completions/mean_terminated_length": 718.34375, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, + "epoch": 0.518429003021148, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04710202291607857, + "learning_rate": 1.6823720882969155e-06, + "loss": 0.0007, + "num_tokens": 145219030.0, + "reward": 2.7495241165161133, + "reward_std": 2.3457837104797363, + "rewards/accuracy_reward/mean": 2.0034303665161133, + "rewards/accuracy_reward/std": 3.3031015396118164, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 858 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 709.0, + "completions/max_terminated_length": 709.0, + "completions/mean_length": 463.46875, + "completions/mean_terminated_length": 463.46875, + "completions/min_length": 320.0, + "completions/min_terminated_length": 320.0, + "epoch": 0.5190332326283988, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.024237893521785736, + "learning_rate": 1.6796748686028368e-06, + "loss": 0.0123, + "num_tokens": 145360948.0, + "reward": 6.205123424530029, + "reward_std": 0.5417225360870361, + "rewards/accuracy_reward/mean": 5.455123424530029, + "rewards/accuracy_reward/std": 3.3447329998016357, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 859 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1016.0, + "completions/max_terminated_length": 1016.0, + "completions/mean_length": 549.28125, + "completions/mean_terminated_length": 549.28125, + "completions/min_length": 293.0, + "completions/min_terminated_length": 293.0, + "epoch": 0.5196374622356495, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03943613916635513, + "learning_rate": 1.6769775303910283e-06, + "loss": 0.0157, + "num_tokens": 145532790.0, + "reward": 1.8616328239440918, + "reward_std": 2.2085671424865723, + "rewards/accuracy_reward/mean": 1.1116328239440918, + "rewards/accuracy_reward/std": 2.765533924102783, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 860 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1042.0, + "completions/max_terminated_length": 1042.0, + "completions/mean_length": 582.515625, + "completions/mean_terminated_length": 582.515625, + "completions/min_length": 314.0, + "completions/min_terminated_length": 314.0, + "epoch": 0.5202416918429003, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.04564009979367256, + "learning_rate": 1.6742800844343242e-06, + "loss": 0.0112, + "num_tokens": 145697143.0, + "reward": 1.759553074836731, + "reward_std": 1.8742945194244385, + "rewards/accuracy_reward/mean": 1.009553074836731, + "rewards/accuracy_reward/std": 2.9741413593292236, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 861 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 955.0, + "completions/max_terminated_length": 955.0, + "completions/mean_length": 597.4375, + "completions/mean_terminated_length": 597.4375, + "completions/min_length": 395.0, + "completions/min_terminated_length": 395.0, + "epoch": 0.5208459214501511, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.026684045791625977, + "learning_rate": 1.671582541505987e-06, + "loss": 0.0158, + "num_tokens": 145869907.0, + "reward": 5.739973068237305, + "reward_std": 0.8967592120170593, + "rewards/accuracy_reward/mean": 4.989973545074463, + "rewards/accuracy_reward/std": 3.5149571895599365, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 862 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1311.0, + "completions/max_terminated_length": 1311.0, + "completions/mean_length": 567.65625, + "completions/mean_terminated_length": 567.65625, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 0.5214501510574018, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0382399745285511, + "learning_rate": 1.6688849123796663e-06, + "loss": 0.0036, + "num_tokens": 146036381.0, + "reward": 4.380373954772949, + "reward_std": 1.6896178722381592, + "rewards/accuracy_reward/mean": 3.630373954772949, + "rewards/accuracy_reward/std": 3.7962512969970703, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 863 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 854.0, + "completions/mean_length": 483.453125, + "completions/mean_terminated_length": 458.61907958984375, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 0.5220543806646526, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.032736584544181824, + "learning_rate": 1.6661872078293582e-06, + "loss": -0.0355, + "num_tokens": 146186794.0, + "reward": 7.6460981369018555, + "reward_std": 1.3390088081359863, + "rewards/accuracy_reward/mean": 6.9078168869018555, + "rewards/accuracy_reward/std": 1.8676702976226807, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 864 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 988.0, + "completions/max_terminated_length": 988.0, + "completions/mean_length": 647.40625, + "completions/mean_terminated_length": 647.40625, + "completions/min_length": 360.0, + "completions/min_terminated_length": 360.0, + "epoch": 0.5226586102719033, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03623616322875023, + "learning_rate": 1.663489438629358e-06, + "loss": -0.006, + "num_tokens": 146367268.0, + "reward": 3.854825019836426, + "reward_std": 1.4536099433898926, + "rewards/accuracy_reward/mean": 3.104825019836426, + "rewards/accuracy_reward/std": 3.616074562072754, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 865 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 763.0, + "completions/max_terminated_length": 763.0, + "completions/mean_length": 521.90625, + "completions/mean_terminated_length": 521.90625, + "completions/min_length": 406.0, + "completions/min_terminated_length": 406.0, + "epoch": 0.5232628398791541, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03803026303648949, + "learning_rate": 1.6607916155542196e-06, + "loss": 0.0086, + "num_tokens": 146511534.0, + "reward": 5.023130893707275, + "reward_std": 1.4082965850830078, + "rewards/accuracy_reward/mean": 4.280943870544434, + "rewards/accuracy_reward/std": 3.6726346015930176, + "rewards/tag_count_reward/mean": 0.7421875, + "rewards/tag_count_reward/std": 0.0625, + "step": 866 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 689.21875, + "completions/mean_terminated_length": 667.6508178710938, + "completions/min_length": 305.0, + "completions/min_terminated_length": 305.0, + "epoch": 0.5238670694864048, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.02189544588327408, + "learning_rate": 1.658093749378713e-06, + "loss": -0.0112, + "num_tokens": 146628572.0, + "reward": 2.842754602432251, + "reward_std": 0.6338731050491333, + "rewards/accuracy_reward/mean": 2.116192102432251, + "rewards/accuracy_reward/std": 3.2384440898895264, + "rewards/tag_count_reward/mean": 0.7265625, + "rewards/tag_count_reward/std": 0.13152606785297394, + "step": 867 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 964.0, + "completions/max_terminated_length": 964.0, + "completions/mean_length": 657.953125, + "completions/mean_terminated_length": 657.953125, + "completions/min_length": 345.0, + "completions/min_terminated_length": 345.0, + "epoch": 0.5244712990936556, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.038908783346414566, + "learning_rate": 1.6553958508777794e-06, + "loss": -0.0032, + "num_tokens": 146805401.0, + "reward": 4.603206157684326, + "reward_std": 2.0760624408721924, + "rewards/accuracy_reward/mean": 3.8532063961029053, + "rewards/accuracy_reward/std": 3.635423183441162, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 868 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1120.0, + "completions/mean_length": 748.40625, + "completions/mean_terminated_length": 727.77783203125, + "completions/min_length": 398.0, + "completions/min_terminated_length": 398.0, + "epoch": 0.5250755287009063, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.033649176359176636, + "learning_rate": 1.65269793082649e-06, + "loss": -0.014, + "num_tokens": 147009235.0, + "reward": 2.048217296600342, + "reward_std": 1.0129036903381348, + "rewards/accuracy_reward/mean": 1.3099359273910522, + "rewards/accuracy_reward/std": 2.919865846633911, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 869 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 982.0, + "completions/max_terminated_length": 982.0, + "completions/mean_length": 543.4375, + "completions/mean_terminated_length": 543.4375, + "completions/min_length": 287.0, + "completions/min_terminated_length": 287.0, + "epoch": 0.525679758308157, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.017341094091534615, + "learning_rate": 1.65e-06, + "loss": -0.0135, + "num_tokens": 147116127.0, + "reward": 2.6506500244140625, + "reward_std": 0.5891428589820862, + "rewards/accuracy_reward/mean": 1.9006500244140625, + "rewards/accuracy_reward/std": 3.3741226196289062, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 870 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1165.0, + "completions/max_terminated_length": 1165.0, + "completions/mean_length": 637.6875, + "completions/mean_terminated_length": 637.6875, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "epoch": 0.5262839879154079, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.037698544561862946, + "learning_rate": 1.6473020691735103e-06, + "loss": -0.0114, + "num_tokens": 147340651.0, + "reward": 5.702300071716309, + "reward_std": 1.4234577417373657, + "rewards/accuracy_reward/mean": 4.952300071716309, + "rewards/accuracy_reward/std": 3.5948429107666016, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 871 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1057.0, + "completions/max_terminated_length": 1057.0, + "completions/mean_length": 626.03125, + "completions/mean_terminated_length": 626.03125, + "completions/min_length": 338.0, + "completions/min_terminated_length": 338.0, + "epoch": 0.5268882175226586, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.02536127157509327, + "learning_rate": 1.644604149122221e-06, + "loss": -0.0053, + "num_tokens": 147524989.0, + "reward": 6.01171875, + "reward_std": 1.1579617261886597, + "rewards/accuracy_reward/mean": 5.26171875, + "rewards/accuracy_reward/std": 3.400789499282837, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 872 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1161.0, + "completions/mean_length": 578.796875, + "completions/mean_terminated_length": 555.4761962890625, + "completions/min_length": 336.0, + "completions/min_terminated_length": 336.0, + "epoch": 0.5274924471299094, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.042886342853307724, + "learning_rate": 1.6419062506212874e-06, + "loss": -0.0121, + "num_tokens": 147651360.0, + "reward": 3.128429651260376, + "reward_std": 2.0535449981689453, + "rewards/accuracy_reward/mean": 2.390148639678955, + "rewards/accuracy_reward/std": 3.554161310195923, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 873 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 812.0, + "completions/max_terminated_length": 812.0, + "completions/mean_length": 517.5, + "completions/mean_terminated_length": 517.5, + "completions/min_length": 308.0, + "completions/min_terminated_length": 308.0, + "epoch": 0.5280966767371601, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0220645684748888, + "learning_rate": 1.6392083844457808e-06, + "loss": -0.0044, + "num_tokens": 147821744.0, + "reward": 6.923001289367676, + "reward_std": 0.911063015460968, + "rewards/accuracy_reward/mean": 6.173001766204834, + "rewards/accuracy_reward/std": 2.8346054553985596, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 874 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1478.0, + "completions/max_terminated_length": 1478.0, + "completions/mean_length": 682.0, + "completions/mean_terminated_length": 682.0, + "completions/min_length": 354.0, + "completions/min_terminated_length": 354.0, + "epoch": 0.5287009063444109, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.02571466565132141, + "learning_rate": 1.6365105613706428e-06, + "loss": -0.0016, + "num_tokens": 148035776.0, + "reward": 6.141709327697754, + "reward_std": 0.7030766606330872, + "rewards/accuracy_reward/mean": 5.391709327697754, + "rewards/accuracy_reward/std": 3.3136260509490967, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 875 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 934.0, + "completions/max_terminated_length": 934.0, + "completions/mean_length": 521.640625, + "completions/mean_terminated_length": 521.640625, + "completions/min_length": 298.0, + "completions/min_terminated_length": 298.0, + "epoch": 0.5293051359516616, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03863590583205223, + "learning_rate": 1.6338127921706424e-06, + "loss": 0.0092, + "num_tokens": 148186409.0, + "reward": 5.049249649047852, + "reward_std": 1.9119211435317993, + "rewards/accuracy_reward/mean": 4.299249649047852, + "rewards/accuracy_reward/std": 3.7218968868255615, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 876 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 918.0, + "completions/max_terminated_length": 918.0, + "completions/mean_length": 518.125, + "completions/mean_terminated_length": 518.125, + "completions/min_length": 330.0, + "completions/min_terminated_length": 330.0, + "epoch": 0.5299093655589124, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.003994614817202091, + "learning_rate": 1.6311150876203336e-06, + "loss": 0.0011, + "num_tokens": 148333937.0, + "reward": 4.430294036865234, + "reward_std": 0.16031594574451447, + "rewards/accuracy_reward/mean": 3.6802937984466553, + "rewards/accuracy_reward/std": 3.809589385986328, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 877 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 775.0, + "completions/max_terminated_length": 775.0, + "completions/mean_length": 543.71875, + "completions/mean_terminated_length": 543.71875, + "completions/min_length": 334.0, + "completions/min_terminated_length": 334.0, + "epoch": 0.5305135951661631, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0217463169246912, + "learning_rate": 1.6284174584940133e-06, + "loss": 0.001, + "num_tokens": 148488543.0, + "reward": 3.0735654830932617, + "reward_std": 0.8431771993637085, + "rewards/accuracy_reward/mean": 2.323565721511841, + "rewards/accuracy_reward/std": 3.4737017154693604, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 878 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 780.0, + "completions/max_terminated_length": 780.0, + "completions/mean_length": 503.328125, + "completions/mean_terminated_length": 503.328125, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "epoch": 0.5311178247734138, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0439550057053566, + "learning_rate": 1.6257199155656758e-06, + "loss": -0.0145, + "num_tokens": 148617012.0, + "reward": 2.596442222595215, + "reward_std": 1.8853274583816528, + "rewards/accuracy_reward/mean": 1.8464422225952148, + "rewards/accuracy_reward/std": 3.2621970176696777, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 879 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1068.0, + "completions/max_terminated_length": 1068.0, + "completions/mean_length": 641.8125, + "completions/mean_terminated_length": 641.8125, + "completions/min_length": 333.0, + "completions/min_terminated_length": 333.0, + "epoch": 0.5317220543806647, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03836005553603172, + "learning_rate": 1.6230224696089712e-06, + "loss": 0.0112, + "num_tokens": 148803384.0, + "reward": 1.6092890501022339, + "reward_std": 1.9915056228637695, + "rewards/accuracy_reward/mean": 0.8592890501022339, + "rewards/accuracy_reward/std": 2.3429436683654785, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 880 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 737.0, + "completions/max_terminated_length": 737.0, + "completions/mean_length": 549.515625, + "completions/mean_terminated_length": 549.515625, + "completions/min_length": 343.0, + "completions/min_terminated_length": 343.0, + "epoch": 0.5323262839879154, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03287895768880844, + "learning_rate": 1.6203251313971633e-06, + "loss": 0.0209, + "num_tokens": 148949737.0, + "reward": 5.163153648376465, + "reward_std": 0.9525147676467896, + "rewards/accuracy_reward/mean": 4.413153171539307, + "rewards/accuracy_reward/std": 3.679412603378296, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 881 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 830.0, + "completions/max_terminated_length": 830.0, + "completions/mean_length": 546.6875, + "completions/mean_terminated_length": 546.6875, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "epoch": 0.5329305135951662, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03632785379886627, + "learning_rate": 1.6176279117030849e-06, + "loss": 0.0136, + "num_tokens": 149121525.0, + "reward": 5.764715194702148, + "reward_std": 1.7080268859863281, + "rewards/accuracy_reward/mean": 5.018621921539307, + "rewards/accuracy_reward/std": 3.5024683475494385, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 882 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 876.0, + "completions/max_terminated_length": 876.0, + "completions/mean_length": 506.265625, + "completions/mean_terminated_length": 506.265625, + "completions/min_length": 298.0, + "completions/min_terminated_length": 298.0, + "epoch": 0.5335347432024169, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03894397243857384, + "learning_rate": 1.6149308212990946e-06, + "loss": 0.0032, + "num_tokens": 149313014.0, + "reward": 4.900918960571289, + "reward_std": 1.8573777675628662, + "rewards/accuracy_reward/mean": 4.150918960571289, + "rewards/accuracy_reward/std": 3.7440602779388428, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 883 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1051.0, + "completions/max_terminated_length": 1051.0, + "completions/mean_length": 529.734375, + "completions/mean_terminated_length": 529.734375, + "completions/min_length": 312.0, + "completions/min_terminated_length": 312.0, + "epoch": 0.5341389728096677, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.022169968113303185, + "learning_rate": 1.6122338709570372e-06, + "loss": 0.0057, + "num_tokens": 149425109.0, + "reward": 7.641101837158203, + "reward_std": 0.8599704504013062, + "rewards/accuracy_reward/mean": 6.891101837158203, + "rewards/accuracy_reward/std": 1.8829795122146606, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 884 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1313.0, + "completions/max_terminated_length": 1313.0, + "completions/mean_length": 662.078125, + "completions/mean_terminated_length": 662.078125, + "completions/min_length": 374.0, + "completions/min_terminated_length": 374.0, + "epoch": 0.5347432024169184, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.057337261736392975, + "learning_rate": 1.6095370714481945e-06, + "loss": 0.036, + "num_tokens": 149624010.0, + "reward": 3.434378147125244, + "reward_std": 2.440277576446533, + "rewards/accuracy_reward/mean": 2.684378147125244, + "rewards/accuracy_reward/std": 3.5864741802215576, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 885 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1123.0, + "completions/max_terminated_length": 1123.0, + "completions/mean_length": 675.640625, + "completions/mean_terminated_length": 675.640625, + "completions/min_length": 389.0, + "completions/min_terminated_length": 389.0, + "epoch": 0.5353474320241692, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.030499495565891266, + "learning_rate": 1.6068404335432495e-06, + "loss": 0.0063, + "num_tokens": 149785203.0, + "reward": 3.8820297718048096, + "reward_std": 1.3634750843048096, + "rewards/accuracy_reward/mean": 3.1320297718048096, + "rewards/accuracy_reward/std": 3.7294154167175293, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 886 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 771.0, + "completions/max_terminated_length": 771.0, + "completions/mean_length": 514.140625, + "completions/mean_terminated_length": 514.140625, + "completions/min_length": 301.0, + "completions/min_terminated_length": 301.0, + "epoch": 0.5359516616314199, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.02570156939327717, + "learning_rate": 1.6041439680122376e-06, + "loss": 0.0107, + "num_tokens": 149938540.0, + "reward": 6.673039436340332, + "reward_std": 0.7844871282577515, + "rewards/accuracy_reward/mean": 5.923039436340332, + "rewards/accuracy_reward/std": 3.055713415145874, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 887 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1027.0, + "completions/max_terminated_length": 1027.0, + "completions/mean_length": 608.546875, + "completions/mean_terminated_length": 608.546875, + "completions/min_length": 332.0, + "completions/min_terminated_length": 332.0, + "epoch": 0.5365558912386706, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.05210784077644348, + "learning_rate": 1.6014476856245056e-06, + "loss": 0.021, + "num_tokens": 150104655.0, + "reward": 1.6805999279022217, + "reward_std": 1.6411439180374146, + "rewards/accuracy_reward/mean": 0.9306000471115112, + "rewards/accuracy_reward/std": 2.4816086292266846, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 888 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 767.0, + "completions/max_terminated_length": 767.0, + "completions/mean_length": 525.234375, + "completions/mean_terminated_length": 525.234375, + "completions/min_length": 368.0, + "completions/min_terminated_length": 368.0, + "epoch": 0.5371601208459215, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.033971454948186874, + "learning_rate": 1.5987515971486707e-06, + "loss": 0.0023, + "num_tokens": 150247662.0, + "reward": 3.51466703414917, + "reward_std": 1.6538512706756592, + "rewards/accuracy_reward/mean": 2.764667272567749, + "rewards/accuracy_reward/std": 3.6665940284729004, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 889 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 898.0, + "completions/max_terminated_length": 898.0, + "completions/mean_length": 594.515625, + "completions/mean_terminated_length": 594.515625, + "completions/min_length": 371.0, + "completions/min_terminated_length": 371.0, + "epoch": 0.5377643504531722, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.052132949233055115, + "learning_rate": 1.5960557133525739e-06, + "loss": 0.0113, + "num_tokens": 150454831.0, + "reward": 2.9131951332092285, + "reward_std": 2.5810627937316895, + "rewards/accuracy_reward/mean": 2.1631951332092285, + "rewards/accuracy_reward/std": 3.3361752033233643, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 890 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 818.0, + "completions/max_terminated_length": 818.0, + "completions/mean_length": 528.515625, + "completions/mean_terminated_length": 528.515625, + "completions/min_length": 388.0, + "completions/min_terminated_length": 388.0, + "epoch": 0.538368580060423, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.048821836709976196, + "learning_rate": 1.5933600450032387e-06, + "loss": 0.0011, + "num_tokens": 150644528.0, + "reward": 3.647606134414673, + "reward_std": 1.4195630550384521, + "rewards/accuracy_reward/mean": 2.897606134414673, + "rewards/accuracy_reward/std": 3.647953510284424, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 891 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1374.0, + "completions/max_terminated_length": 1374.0, + "completions/mean_length": 629.09375, + "completions/mean_terminated_length": 629.09375, + "completions/min_length": 373.0, + "completions/min_terminated_length": 373.0, + "epoch": 0.5389728096676737, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04453074932098389, + "learning_rate": 1.5906646028668298e-06, + "loss": 0.0312, + "num_tokens": 150802486.0, + "reward": 5.076516628265381, + "reward_std": 2.253300189971924, + "rewards/accuracy_reward/mean": 4.326517105102539, + "rewards/accuracy_reward/std": 3.6248586177825928, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 892 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1154.0, + "completions/mean_length": 610.375, + "completions/mean_terminated_length": 587.5556030273438, + "completions/min_length": 329.0, + "completions/min_terminated_length": 329.0, + "epoch": 0.5395770392749245, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03246114403009415, + "learning_rate": 1.5879693977086067e-06, + "loss": -0.0279, + "num_tokens": 150948190.0, + "reward": 6.364284515380859, + "reward_std": 1.5476276874542236, + "rewards/accuracy_reward/mean": 5.626003265380859, + "rewards/accuracy_reward/std": 3.161660671234131, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 893 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 884.0, + "completions/max_terminated_length": 884.0, + "completions/mean_length": 544.46875, + "completions/mean_terminated_length": 544.46875, + "completions/min_length": 330.0, + "completions/min_terminated_length": 330.0, + "epoch": 0.5401812688821752, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.039793696254491806, + "learning_rate": 1.5852744402928842e-06, + "loss": -0.012, + "num_tokens": 151097900.0, + "reward": 2.8870673179626465, + "reward_std": 1.4741007089614868, + "rewards/accuracy_reward/mean": 2.1370673179626465, + "rewards/accuracy_reward/std": 3.3462183475494385, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 894 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1055.0, + "completions/max_terminated_length": 1055.0, + "completions/mean_length": 433.3125, + "completions/mean_terminated_length": 433.3125, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 0.540785498489426, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.030474156141281128, + "learning_rate": 1.582579741382985e-06, + "loss": 0.0233, + "num_tokens": 151253952.0, + "reward": 5.736935615539551, + "reward_std": 1.2981747388839722, + "rewards/accuracy_reward/mean": 4.986936092376709, + "rewards/accuracy_reward/std": 3.5128557682037354, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 895 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 854.0, + "completions/max_terminated_length": 854.0, + "completions/mean_length": 529.3125, + "completions/mean_terminated_length": 529.3125, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 0.5413897280966767, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03332355618476868, + "learning_rate": 1.5798853117412024e-06, + "loss": -0.0004, + "num_tokens": 151484996.0, + "reward": 4.297486305236816, + "reward_std": 1.4752352237701416, + "rewards/accuracy_reward/mean": 3.5474860668182373, + "rewards/accuracy_reward/std": 3.756478786468506, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 896 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1320.0, + "completions/max_terminated_length": 1320.0, + "completions/mean_length": 592.390625, + "completions/mean_terminated_length": 592.390625, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "epoch": 0.5419939577039274, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03476888686418533, + "learning_rate": 1.5771911621287518e-06, + "loss": -0.02, + "num_tokens": 151675293.0, + "reward": 2.9469170570373535, + "reward_std": 1.5409770011901855, + "rewards/accuracy_reward/mean": 2.1969170570373535, + "rewards/accuracy_reward/std": 3.3960118293762207, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 897 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 809.0, + "completions/max_terminated_length": 809.0, + "completions/mean_length": 551.40625, + "completions/mean_terminated_length": 551.40625, + "completions/min_length": 360.0, + "completions/min_terminated_length": 360.0, + "epoch": 0.5425981873111783, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04944761097431183, + "learning_rate": 1.574497303305731e-06, + "loss": 0.0268, + "num_tokens": 151838215.0, + "reward": 4.535370826721191, + "reward_std": 2.386733293533325, + "rewards/accuracy_reward/mean": 3.7853705883026123, + "rewards/accuracy_reward/std": 3.7983217239379883, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 898 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 743.0, + "completions/max_terminated_length": 743.0, + "completions/mean_length": 476.421875, + "completions/mean_terminated_length": 476.421875, + "completions/min_length": 328.0, + "completions/min_terminated_length": 328.0, + "epoch": 0.543202416918429, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.025151243433356285, + "learning_rate": 1.5718037460310778e-06, + "loss": 0.009, + "num_tokens": 152019858.0, + "reward": 4.106771945953369, + "reward_std": 1.1908596754074097, + "rewards/accuracy_reward/mean": 3.356771945953369, + "rewards/accuracy_reward/std": 3.7841341495513916, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 899 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1289.0, + "completions/max_terminated_length": 1289.0, + "completions/mean_length": 724.421875, + "completions/mean_terminated_length": 724.421875, + "completions/min_length": 313.0, + "completions/min_terminated_length": 313.0, + "epoch": 0.5438066465256798, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.05040884390473366, + "learning_rate": 1.5691105010625233e-06, + "loss": -0.0133, + "num_tokens": 152191997.0, + "reward": 2.5659687519073486, + "reward_std": 1.9464054107666016, + "rewards/accuracy_reward/mean": 1.8159687519073486, + "rewards/accuracy_reward/std": 3.1301956176757812, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 900 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1288.0, + "completions/max_terminated_length": 1288.0, + "completions/mean_length": 624.203125, + "completions/mean_terminated_length": 624.203125, + "completions/min_length": 329.0, + "completions/min_terminated_length": 329.0, + "epoch": 0.5444108761329305, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.024074215441942215, + "learning_rate": 1.5664175791565532e-06, + "loss": 0.0029, + "num_tokens": 152334698.0, + "reward": 2.7729170322418213, + "reward_std": 1.16175377368927, + "rewards/accuracy_reward/mean": 2.0229170322418213, + "rewards/accuracy_reward/std": 3.2931406497955322, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 901 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1004.0, + "completions/max_terminated_length": 1004.0, + "completions/mean_length": 615.4375, + "completions/mean_terminated_length": 615.4375, + "completions/min_length": 365.0, + "completions/min_terminated_length": 365.0, + "epoch": 0.5450151057401813, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.041746292263269424, + "learning_rate": 1.563724991068362e-06, + "loss": -0.0133, + "num_tokens": 152513446.0, + "reward": 7.144878387451172, + "reward_std": 1.7391819953918457, + "rewards/accuracy_reward/mean": 6.394878387451172, + "rewards/accuracy_reward/std": 2.6554572582244873, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 902 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 719.0, + "completions/max_terminated_length": 719.0, + "completions/mean_length": 490.8125, + "completions/mean_terminated_length": 490.8125, + "completions/min_length": 322.0, + "completions/min_terminated_length": 322.0, + "epoch": 0.545619335347432, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04058327525854111, + "learning_rate": 1.5610327475518113e-06, + "loss": -0.0189, + "num_tokens": 152751210.0, + "reward": 3.8347673416137695, + "reward_std": 1.7679816484451294, + "rewards/accuracy_reward/mean": 3.0847673416137695, + "rewards/accuracy_reward/std": 3.633641242980957, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 903 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 687.0, + "completions/max_terminated_length": 687.0, + "completions/mean_length": 504.078125, + "completions/mean_terminated_length": 504.078125, + "completions/min_length": 369.0, + "completions/min_terminated_length": 369.0, + "epoch": 0.5462235649546828, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.019332535564899445, + "learning_rate": 1.5583408593593856e-06, + "loss": 0.0108, + "num_tokens": 152996687.0, + "reward": 6.19821834564209, + "reward_std": 0.4628986418247223, + "rewards/accuracy_reward/mean": 5.448218822479248, + "rewards/accuracy_reward/std": 3.303351640701294, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 904 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 873.0, + "completions/max_terminated_length": 873.0, + "completions/mean_length": 584.8125, + "completions/mean_terminated_length": 584.8125, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 0.5468277945619335, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.01419343426823616, + "learning_rate": 1.555649337242152e-06, + "loss": -0.0042, + "num_tokens": 153158563.0, + "reward": 4.357948303222656, + "reward_std": 0.47591742873191833, + "rewards/accuracy_reward/mean": 3.6079485416412354, + "rewards/accuracy_reward/std": 3.722235679626465, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 905 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 853.0, + "completions/max_terminated_length": 853.0, + "completions/mean_length": 533.09375, + "completions/mean_terminated_length": 533.09375, + "completions/min_length": 335.0, + "completions/min_terminated_length": 335.0, + "epoch": 0.5474320241691842, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03462842106819153, + "learning_rate": 1.5529581919497144e-06, + "loss": -0.0025, + "num_tokens": 153391049.0, + "reward": 5.973348617553711, + "reward_std": 1.3083610534667969, + "rewards/accuracy_reward/mean": 5.223348617553711, + "rewards/accuracy_reward/std": 3.4059250354766846, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 906 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 692.0, + "completions/max_terminated_length": 692.0, + "completions/mean_length": 517.859375, + "completions/mean_terminated_length": 517.859375, + "completions/min_length": 391.0, + "completions/min_terminated_length": 391.0, + "epoch": 0.5480362537764351, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0542411282658577, + "learning_rate": 1.5502674342301721e-06, + "loss": 0.0205, + "num_tokens": 153570016.0, + "reward": 5.359459400177002, + "reward_std": 2.581393241882324, + "rewards/accuracy_reward/mean": 4.609459400177002, + "rewards/accuracy_reward/std": 3.5340399742126465, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 907 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1149.0, + "completions/mean_length": 645.109375, + "completions/mean_terminated_length": 622.84130859375, + "completions/min_length": 364.0, + "completions/min_terminated_length": 364.0, + "epoch": 0.5486404833836858, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0518823117017746, + "learning_rate": 1.5475770748300756e-06, + "loss": -0.0136, + "num_tokens": 153703879.0, + "reward": 3.2100234031677246, + "reward_std": 2.7041268348693848, + "rewards/accuracy_reward/mean": 2.4717421531677246, + "rewards/accuracy_reward/std": 3.526292324066162, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 908 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1171.0, + "completions/max_terminated_length": 1171.0, + "completions/mean_length": 688.578125, + "completions/mean_terminated_length": 688.578125, + "completions/min_length": 346.0, + "completions/min_terminated_length": 346.0, + "epoch": 0.5492447129909366, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.05462884157896042, + "learning_rate": 1.5448871244943861e-06, + "loss": -0.023, + "num_tokens": 153871708.0, + "reward": 3.7933967113494873, + "reward_std": 2.464940309524536, + "rewards/accuracy_reward/mean": 3.0433969497680664, + "rewards/accuracy_reward/std": 3.6881368160247803, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 909 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 996.0, + "completions/mean_length": 549.09375, + "completions/mean_terminated_length": 525.3016357421875, + "completions/min_length": 331.0, + "completions/min_terminated_length": 331.0, + "epoch": 0.5498489425981873, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03984688222408295, + "learning_rate": 1.5421975939664297e-06, + "loss": -0.0501, + "num_tokens": 154027986.0, + "reward": 5.127163887023926, + "reward_std": 1.8695220947265625, + "rewards/accuracy_reward/mean": 4.392788887023926, + "rewards/accuracy_reward/std": 3.730419397354126, + "rewards/tag_count_reward/mean": 0.734375, + "rewards/tag_count_reward/std": 0.09834947437047958, + "step": 910 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1246.0, + "completions/max_terminated_length": 1246.0, + "completions/mean_length": 640.09375, + "completions/mean_terminated_length": 640.09375, + "completions/min_length": 361.0, + "completions/min_terminated_length": 361.0, + "epoch": 0.5504531722054381, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04237554967403412, + "learning_rate": 1.5395084939878567e-06, + "loss": 0.0172, + "num_tokens": 154209176.0, + "reward": 3.5895421504974365, + "reward_std": 2.179551601409912, + "rewards/accuracy_reward/mean": 2.8395421504974365, + "rewards/accuracy_reward/std": 3.733328104019165, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 911 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 999.0, + "completions/max_terminated_length": 999.0, + "completions/mean_length": 643.390625, + "completions/mean_terminated_length": 643.390625, + "completions/min_length": 286.0, + "completions/min_terminated_length": 286.0, + "epoch": 0.5510574018126888, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04578967019915581, + "learning_rate": 1.536819835298597e-06, + "loss": -0.0145, + "num_tokens": 154402577.0, + "reward": 4.10499382019043, + "reward_std": 1.8654956817626953, + "rewards/accuracy_reward/mean": 3.3549938201904297, + "rewards/accuracy_reward/std": 3.7580723762512207, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 912 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 814.0, + "completions/mean_length": 548.453125, + "completions/mean_terminated_length": 500.08062744140625, + "completions/min_length": 269.0, + "completions/min_terminated_length": 269.0, + "epoch": 0.5516616314199396, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0022560306824743748, + "learning_rate": 1.5341316286368189e-06, + "loss": -0.0115, + "num_tokens": 154555758.0, + "reward": 2.5484886169433594, + "reward_std": 0.1575605720281601, + "rewards/accuracy_reward/mean": 1.8219263553619385, + "rewards/accuracy_reward/std": 3.258065938949585, + "rewards/tag_count_reward/mean": 0.7265625, + "rewards/tag_count_reward/std": 0.13152606785297394, + "step": 913 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1052.0, + "completions/max_terminated_length": 1052.0, + "completions/mean_length": 588.4375, + "completions/mean_terminated_length": 588.4375, + "completions/min_length": 445.0, + "completions/min_terminated_length": 445.0, + "epoch": 0.5522658610271903, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03765147551894188, + "learning_rate": 1.5314438847388846e-06, + "loss": -0.0123, + "num_tokens": 154708218.0, + "reward": 3.1817264556884766, + "reward_std": 1.4603501558303833, + "rewards/accuracy_reward/mean": 2.4317264556884766, + "rewards/accuracy_reward/std": 3.5429675579071045, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 914 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1390.0, + "completions/max_terminated_length": 1390.0, + "completions/mean_length": 654.125, + "completions/mean_terminated_length": 654.125, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, + "epoch": 0.552870090634441, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04526229202747345, + "learning_rate": 1.5287566143393092e-06, + "loss": 0.0725, + "num_tokens": 154949650.0, + "reward": 3.718353271484375, + "reward_std": 1.8324475288391113, + "rewards/accuracy_reward/mean": 2.968353271484375, + "rewards/accuracy_reward/std": 3.6574149131774902, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 915 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 792.0, + "completions/max_terminated_length": 792.0, + "completions/mean_length": 484.953125, + "completions/mean_terminated_length": 484.953125, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "epoch": 0.5534743202416919, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.02966209128499031, + "learning_rate": 1.5260698281707156e-06, + "loss": -0.0104, + "num_tokens": 155135903.0, + "reward": 4.157942295074463, + "reward_std": 1.707033634185791, + "rewards/accuracy_reward/mean": 3.407942533493042, + "rewards/accuracy_reward/std": 3.6364753246307373, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 916 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 771.0, + "completions/max_terminated_length": 771.0, + "completions/mean_length": 536.828125, + "completions/mean_terminated_length": 536.828125, + "completions/min_length": 338.0, + "completions/min_terminated_length": 338.0, + "epoch": 0.5540785498489426, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05352379009127617, + "learning_rate": 1.523383536963793e-06, + "loss": -0.0121, + "num_tokens": 155316004.0, + "reward": 5.724915504455566, + "reward_std": 3.157825469970703, + "rewards/accuracy_reward/mean": 4.974915981292725, + "rewards/accuracy_reward/std": 3.5401132106781006, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 917 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1174.0, + "completions/max_terminated_length": 1174.0, + "completions/mean_length": 585.96875, + "completions/mean_terminated_length": 585.96875, + "completions/min_length": 283.0, + "completions/min_terminated_length": 283.0, + "epoch": 0.5546827794561934, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.041891805827617645, + "learning_rate": 1.5206977514472534e-06, + "loss": -0.0147, + "num_tokens": 155555458.0, + "reward": 3.667188882827759, + "reward_std": 1.7663557529449463, + "rewards/accuracy_reward/mean": 2.917189121246338, + "rewards/accuracy_reward/std": 3.601590633392334, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 918 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 633.0, + "completions/max_terminated_length": 633.0, + "completions/mean_length": 443.890625, + "completions/mean_terminated_length": 443.890625, + "completions/min_length": 272.0, + "completions/min_terminated_length": 272.0, + "epoch": 0.5552870090634441, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04248017445206642, + "learning_rate": 1.5180124823477908e-06, + "loss": -0.0149, + "num_tokens": 155716043.0, + "reward": 7.134601593017578, + "reward_std": 1.7455748319625854, + "rewards/accuracy_reward/mean": 6.384601593017578, + "rewards/accuracy_reward/std": 2.604982614517212, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 919 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1015.0, + "completions/max_terminated_length": 1015.0, + "completions/mean_length": 596.609375, + "completions/mean_terminated_length": 596.609375, + "completions/min_length": 300.0, + "completions/min_terminated_length": 300.0, + "epoch": 0.5558912386706949, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.022788142785429955, + "learning_rate": 1.5153277403900347e-06, + "loss": 0.006, + "num_tokens": 155898530.0, + "reward": 4.192404747009277, + "reward_std": 0.7475414276123047, + "rewards/accuracy_reward/mean": 3.4424047470092773, + "rewards/accuracy_reward/std": 3.7941818237304688, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 920 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 697.0, + "completions/max_terminated_length": 697.0, + "completions/mean_length": 486.203125, + "completions/mean_terminated_length": 486.203125, + "completions/min_length": 304.0, + "completions/min_terminated_length": 304.0, + "epoch": 0.5564954682779456, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0153780123218894, + "learning_rate": 1.512643536296511e-06, + "loss": -0.0014, + "num_tokens": 156058175.0, + "reward": 6.146821975708008, + "reward_std": 0.5090389847755432, + "rewards/accuracy_reward/mean": 5.396821975708008, + "rewards/accuracy_reward/std": 3.2934865951538086, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 921 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 940.0, + "completions/max_terminated_length": 940.0, + "completions/mean_length": 517.5, + "completions/mean_terminated_length": 517.5, + "completions/min_length": 346.0, + "completions/min_terminated_length": 346.0, + "epoch": 0.5570996978851964, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04348801448941231, + "learning_rate": 1.5099598807875955e-06, + "loss": 0.0006, + "num_tokens": 156187503.0, + "reward": 2.1085047721862793, + "reward_std": 1.8191828727722168, + "rewards/accuracy_reward/mean": 1.3585046529769897, + "rewards/accuracy_reward/std": 2.743656873703003, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 922 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 819.0, + "completions/max_terminated_length": 819.0, + "completions/mean_length": 524.25, + "completions/mean_terminated_length": 524.25, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.5577039274924471, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.026823947206139565, + "learning_rate": 1.5072767845814744e-06, + "loss": -0.0005, + "num_tokens": 156371375.0, + "reward": 4.862502098083496, + "reward_std": 0.8163702487945557, + "rewards/accuracy_reward/mean": 4.116408348083496, + "rewards/accuracy_reward/std": 3.6575024127960205, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 923 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1409.0, + "completions/max_terminated_length": 1409.0, + "completions/mean_length": 551.0, + "completions/mean_terminated_length": 551.0, + "completions/min_length": 358.0, + "completions/min_terminated_length": 358.0, + "epoch": 0.5583081570996978, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0378192700445652, + "learning_rate": 1.5045942583941002e-06, + "loss": 0.0096, + "num_tokens": 156628447.0, + "reward": 5.507862567901611, + "reward_std": 2.0172595977783203, + "rewards/accuracy_reward/mean": 4.757862091064453, + "rewards/accuracy_reward/std": 3.5918633937835693, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 924 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 772.0, + "completions/max_terminated_length": 772.0, + "completions/mean_length": 541.9375, + "completions/mean_terminated_length": 541.9375, + "completions/min_length": 304.0, + "completions/min_terminated_length": 304.0, + "epoch": 0.5589123867069486, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.028629202395677567, + "learning_rate": 1.5019123129391477e-06, + "loss": 0.006, + "num_tokens": 156884299.0, + "reward": 3.8496124744415283, + "reward_std": 0.9331009984016418, + "rewards/accuracy_reward/mean": 3.0996124744415283, + "rewards/accuracy_reward/std": 3.6629157066345215, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 925 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 914.0, + "completions/max_terminated_length": 914.0, + "completions/mean_length": 572.484375, + "completions/mean_terminated_length": 572.484375, + "completions/min_length": 384.0, + "completions/min_terminated_length": 384.0, + "epoch": 0.5595166163141994, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04643767327070236, + "learning_rate": 1.499230958927974e-06, + "loss": -0.0087, + "num_tokens": 157033914.0, + "reward": 5.014410972595215, + "reward_std": 2.207439661026001, + "rewards/accuracy_reward/mean": 4.264410972595215, + "rewards/accuracy_reward/std": 3.705827236175537, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 926 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1235.0, + "completions/mean_length": 662.0, + "completions/mean_terminated_length": 640.0000610351562, + "completions/min_length": 444.0, + "completions/min_terminated_length": 444.0, + "epoch": 0.5601208459214502, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03913164883852005, + "learning_rate": 1.4965502070695716e-06, + "loss": 0.0022, + "num_tokens": 157204890.0, + "reward": 2.591099977493286, + "reward_std": 2.023698568344116, + "rewards/accuracy_reward/mean": 1.8528187274932861, + "rewards/accuracy_reward/std": 3.273303508758545, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 927 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 685.0, + "completions/max_terminated_length": 685.0, + "completions/mean_length": 428.6875, + "completions/mean_terminated_length": 428.6875, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 0.5607250755287009, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.01404188945889473, + "learning_rate": 1.4938700680705308e-06, + "loss": 0.0039, + "num_tokens": 157339702.0, + "reward": 4.352132797241211, + "reward_std": 0.47116297483444214, + "rewards/accuracy_reward/mean": 3.602132797241211, + "rewards/accuracy_reward/std": 3.745950222015381, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 928 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 854.0, + "completions/max_terminated_length": 854.0, + "completions/mean_length": 523.46875, + "completions/mean_terminated_length": 523.46875, + "completions/min_length": 319.0, + "completions/min_terminated_length": 319.0, + "epoch": 0.5613293051359517, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.024856427684426308, + "learning_rate": 1.4911905526349927e-06, + "loss": 0.0018, + "num_tokens": 157508100.0, + "reward": 2.8553829193115234, + "reward_std": 0.6749151349067688, + "rewards/accuracy_reward/mean": 2.1053829193115234, + "rewards/accuracy_reward/std": 3.3599085807800293, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 929 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 822.0, + "completions/max_terminated_length": 822.0, + "completions/mean_length": 511.546875, + "completions/mean_terminated_length": 511.546875, + "completions/min_length": 301.0, + "completions/min_terminated_length": 301.0, + "epoch": 0.5619335347432024, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.045641809701919556, + "learning_rate": 1.4885116714646078e-06, + "loss": -0.0064, + "num_tokens": 157668599.0, + "reward": 5.48318338394165, + "reward_std": 2.458768844604492, + "rewards/accuracy_reward/mean": 4.733182907104492, + "rewards/accuracy_reward/std": 3.5735056400299072, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 930 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 883.0, + "completions/mean_length": 541.828125, + "completions/mean_terminated_length": 517.920654296875, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 0.5625377643504532, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.052524372935295105, + "learning_rate": 1.4858334352584938e-06, + "loss": 0.0162, + "num_tokens": 157865740.0, + "reward": 4.471828460693359, + "reward_std": 2.5606207847595215, + "rewards/accuracy_reward/mean": 3.7335469722747803, + "rewards/accuracy_reward/std": 3.846510171890259, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 931 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 733.0, + "completions/max_terminated_length": 733.0, + "completions/mean_length": 506.71875, + "completions/mean_terminated_length": 506.71875, + "completions/min_length": 284.0, + "completions/min_terminated_length": 284.0, + "epoch": 0.5631419939577039, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03050101175904274, + "learning_rate": 1.483155854713193e-06, + "loss": -0.0092, + "num_tokens": 158024874.0, + "reward": 7.625376224517822, + "reward_std": 0.868943452835083, + "rewards/accuracy_reward/mean": 6.8753767013549805, + "rewards/accuracy_reward/std": 1.8589370250701904, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 932 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1200.0, + "completions/max_terminated_length": 1200.0, + "completions/mean_length": 649.375, + "completions/mean_terminated_length": 649.375, + "completions/min_length": 351.0, + "completions/min_terminated_length": 351.0, + "epoch": 0.5637462235649546, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0043929568491876125, + "learning_rate": 1.480478940522629e-06, + "loss": -0.0014, + "num_tokens": 158261362.0, + "reward": 2.556467056274414, + "reward_std": 0.15835553407669067, + "rewards/accuracy_reward/mean": 1.8064671754837036, + "rewards/accuracy_reward/std": 3.26963472366333, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 933 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1181.0, + "completions/mean_length": 640.296875, + "completions/mean_terminated_length": 617.952392578125, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 0.5643504531722054, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04538395255804062, + "learning_rate": 1.4778027033780628e-06, + "loss": -0.0163, + "num_tokens": 158448981.0, + "reward": 3.47196102142334, + "reward_std": 2.2569141387939453, + "rewards/accuracy_reward/mean": 2.74149227142334, + "rewards/accuracy_reward/std": 3.6565704345703125, + "rewards/tag_count_reward/mean": 0.73046875, + "rewards/tag_count_reward/std": 0.1118449866771698, + "step": 934 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1117.0, + "completions/max_terminated_length": 1117.0, + "completions/mean_length": 644.890625, + "completions/mean_terminated_length": 644.890625, + "completions/min_length": 279.0, + "completions/min_terminated_length": 279.0, + "epoch": 0.5649546827794562, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.02879917435348034, + "learning_rate": 1.4751271539680526e-06, + "loss": 0.0126, + "num_tokens": 158622958.0, + "reward": 4.261811256408691, + "reward_std": 1.3636348247528076, + "rewards/accuracy_reward/mean": 3.5118110179901123, + "rewards/accuracy_reward/std": 3.6094563007354736, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 935 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 822.0, + "completions/max_terminated_length": 822.0, + "completions/mean_length": 601.5, + "completions/mean_terminated_length": 601.5, + "completions/min_length": 390.0, + "completions/min_terminated_length": 390.0, + "epoch": 0.565558912386707, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04295743629336357, + "learning_rate": 1.4724523029784097e-06, + "loss": -0.0259, + "num_tokens": 158791726.0, + "reward": 5.462362289428711, + "reward_std": 1.9509129524230957, + "rewards/accuracy_reward/mean": 4.712362766265869, + "rewards/accuracy_reward/std": 3.559250593185425, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 936 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 890.0, + "completions/max_terminated_length": 890.0, + "completions/mean_length": 533.671875, + "completions/mean_terminated_length": 533.671875, + "completions/min_length": 367.0, + "completions/min_terminated_length": 367.0, + "epoch": 0.5661631419939577, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.029615765437483788, + "learning_rate": 1.4697781610921552e-06, + "loss": -0.001, + "num_tokens": 158937225.0, + "reward": 5.757462501525879, + "reward_std": 1.4645493030548096, + "rewards/accuracy_reward/mean": 5.007462501525879, + "rewards/accuracy_reward/std": 3.5336904525756836, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 937 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 832.0, + "completions/max_terminated_length": 832.0, + "completions/mean_length": 542.609375, + "completions/mean_terminated_length": 542.609375, + "completions/min_length": 349.0, + "completions/min_terminated_length": 349.0, + "epoch": 0.5667673716012085, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03069002367556095, + "learning_rate": 1.4671047389894795e-06, + "loss": -0.0287, + "num_tokens": 159102288.0, + "reward": 4.8050360679626465, + "reward_std": 2.1266605854034424, + "rewards/accuracy_reward/mean": 4.0550360679626465, + "rewards/accuracy_reward/std": 3.6507766246795654, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 938 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 930.0, + "completions/max_terminated_length": 930.0, + "completions/mean_length": 567.015625, + "completions/mean_terminated_length": 567.015625, + "completions/min_length": 399.0, + "completions/min_terminated_length": 399.0, + "epoch": 0.5673716012084592, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07233448326587677, + "learning_rate": 1.4644320473476969e-06, + "loss": 0.0405, + "num_tokens": 159261921.0, + "reward": 4.462932586669922, + "reward_std": 3.1193976402282715, + "rewards/accuracy_reward/mean": 3.712932825088501, + "rewards/accuracy_reward/std": 3.742523193359375, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 939 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1073.0, + "completions/max_terminated_length": 1073.0, + "completions/mean_length": 544.6875, + "completions/mean_terminated_length": 544.6875, + "completions/min_length": 353.0, + "completions/min_terminated_length": 353.0, + "epoch": 0.56797583081571, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05231642350554466, + "learning_rate": 1.461760096841205e-06, + "loss": -0.0096, + "num_tokens": 159415069.0, + "reward": 3.041100025177002, + "reward_std": 2.618455171585083, + "rewards/accuracy_reward/mean": 2.291100025177002, + "rewards/accuracy_reward/std": 3.4614319801330566, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 940 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1013.0, + "completions/max_terminated_length": 1013.0, + "completions/mean_length": 486.296875, + "completions/mean_terminated_length": 486.296875, + "completions/min_length": 271.0, + "completions/min_terminated_length": 271.0, + "epoch": 0.5685800604229607, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.022377274930477142, + "learning_rate": 1.4590888981414417e-06, + "loss": 0.0069, + "num_tokens": 159611552.0, + "reward": 6.23519229888916, + "reward_std": 0.5361616611480713, + "rewards/accuracy_reward/mean": 5.48519229888916, + "rewards/accuracy_reward/std": 3.1091599464416504, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 941 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 624.0, + "completions/max_terminated_length": 624.0, + "completions/mean_length": 454.328125, + "completions/mean_terminated_length": 454.328125, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, + "epoch": 0.5691842900302114, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.015261244960129261, + "learning_rate": 1.456418461916842e-06, + "loss": -0.0059, + "num_tokens": 159779349.0, + "reward": 6.215690612792969, + "reward_std": 0.5686363577842712, + "rewards/accuracy_reward/mean": 5.465690612792969, + "rewards/accuracy_reward/std": 3.6062264442443848, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 942 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 674.0, + "completions/max_terminated_length": 674.0, + "completions/mean_length": 490.8125, + "completions/mean_terminated_length": 490.8125, + "completions/min_length": 304.0, + "completions/min_terminated_length": 304.0, + "epoch": 0.5697885196374622, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.025415997952222824, + "learning_rate": 1.4537487988327945e-06, + "loss": -0.0007, + "num_tokens": 159949993.0, + "reward": 5.981298446655273, + "reward_std": 0.7518050074577332, + "rewards/accuracy_reward/mean": 5.231298446655273, + "rewards/accuracy_reward/std": 3.413029193878174, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 943 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 697.0, + "completions/max_terminated_length": 697.0, + "completions/mean_length": 518.53125, + "completions/mean_terminated_length": 518.53125, + "completions/min_length": 296.0, + "completions/min_terminated_length": 296.0, + "epoch": 0.570392749244713, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.03960578888654709, + "learning_rate": 1.4510799195516027e-06, + "loss": -0.0049, + "num_tokens": 160107515.0, + "reward": 3.961193561553955, + "reward_std": 1.3468263149261475, + "rewards/accuracy_reward/mean": 3.211193561553955, + "rewards/accuracy_reward/std": 3.7710139751434326, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 944 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1219.0, + "completions/max_terminated_length": 1219.0, + "completions/mean_length": 533.5625, + "completions/mean_terminated_length": 533.5625, + "completions/min_length": 327.0, + "completions/min_terminated_length": 327.0, + "epoch": 0.5709969788519638, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.042883530259132385, + "learning_rate": 1.4484118347324365e-06, + "loss": 0.0122, + "num_tokens": 160317551.0, + "reward": 4.694921493530273, + "reward_std": 1.8348666429519653, + "rewards/accuracy_reward/mean": 3.9449219703674316, + "rewards/accuracy_reward/std": 3.768559455871582, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 945 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 809.0, + "completions/max_terminated_length": 809.0, + "completions/mean_length": 512.25, + "completions/mean_terminated_length": 512.25, + "completions/min_length": 272.0, + "completions/min_terminated_length": 272.0, + "epoch": 0.5716012084592145, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.05279234051704407, + "learning_rate": 1.4457445550312955e-06, + "loss": 0.0291, + "num_tokens": 160469551.0, + "reward": 4.449649810791016, + "reward_std": 2.617382287979126, + "rewards/accuracy_reward/mean": 3.7113685607910156, + "rewards/accuracy_reward/std": 3.74128794670105, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 946 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 859.0, + "completions/max_terminated_length": 859.0, + "completions/mean_length": 552.5625, + "completions/mean_terminated_length": 552.5625, + "completions/min_length": 334.0, + "completions/min_terminated_length": 334.0, + "epoch": 0.5722054380664653, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.002565003465861082, + "learning_rate": 1.443078091100962e-06, + "loss": 0.0003, + "num_tokens": 160704019.0, + "reward": 2.600431203842163, + "reward_std": 0.09619924426078796, + "rewards/accuracy_reward/mean": 1.850431203842163, + "rewards/accuracy_reward/std": 3.254472494125366, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 947 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 990.0, + "completions/max_terminated_length": 990.0, + "completions/mean_length": 573.765625, + "completions/mean_terminated_length": 573.765625, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "epoch": 0.572809667673716, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03871718421578407, + "learning_rate": 1.4404124535909613e-06, + "loss": 0.01, + "num_tokens": 160870724.0, + "reward": 7.336806297302246, + "reward_std": 1.490092396736145, + "rewards/accuracy_reward/mean": 6.586806297302246, + "rewards/accuracy_reward/std": 2.432368755340576, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 948 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1124.0, + "completions/max_terminated_length": 1124.0, + "completions/mean_length": 530.875, + "completions/mean_terminated_length": 530.875, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 0.5734138972809668, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.031081771478056908, + "learning_rate": 1.4377476531475171e-06, + "loss": 0.0063, + "num_tokens": 161020828.0, + "reward": 2.0375876426696777, + "reward_std": 1.418951392173767, + "rewards/accuracy_reward/mean": 1.2875874042510986, + "rewards/accuracy_reward/std": 2.6526288986206055, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 949 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 770.0, + "completions/max_terminated_length": 770.0, + "completions/mean_length": 543.1875, + "completions/mean_terminated_length": 543.1875, + "completions/min_length": 350.0, + "completions/min_terminated_length": 350.0, + "epoch": 0.5740181268882175, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.030129732564091682, + "learning_rate": 1.435083700413511e-06, + "loss": -0.0189, + "num_tokens": 161186296.0, + "reward": 5.9861159324646, + "reward_std": 1.3064844608306885, + "rewards/accuracy_reward/mean": 5.236115455627441, + "rewards/accuracy_reward/std": 3.4293060302734375, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 950 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 693.0, + "completions/mean_length": 550.609375, + "completions/mean_terminated_length": 526.84130859375, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.5746223564954682, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.03100818395614624, + "learning_rate": 1.4324206060284383e-06, + "loss": -0.0501, + "num_tokens": 161362943.0, + "reward": 3.9531264305114746, + "reward_std": 0.9397454857826233, + "rewards/accuracy_reward/mean": 3.2265641689300537, + "rewards/accuracy_reward/std": 3.755115032196045, + "rewards/tag_count_reward/mean": 0.7265625, + "rewards/tag_count_reward/std": 0.13152606785297394, + "step": 951 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1012.0, + "completions/max_terminated_length": 1012.0, + "completions/mean_length": 512.3125, + "completions/mean_terminated_length": 512.3125, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "epoch": 0.575226586102719, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.02166815847158432, + "learning_rate": 1.4297583806283662e-06, + "loss": 0.0108, + "num_tokens": 161599219.0, + "reward": 4.297303199768066, + "reward_std": 0.5878245830535889, + "rewards/accuracy_reward/mean": 3.5473031997680664, + "rewards/accuracy_reward/std": 3.756284475326538, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 952 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 763.0, + "completions/max_terminated_length": 763.0, + "completions/mean_length": 520.375, + "completions/mean_terminated_length": 520.375, + "completions/min_length": 328.0, + "completions/min_terminated_length": 328.0, + "epoch": 0.5758308157099697, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.027026576921343803, + "learning_rate": 1.4270970348458913e-06, + "loss": 0.0017, + "num_tokens": 161768203.0, + "reward": 2.7290844917297363, + "reward_std": 1.1603765487670898, + "rewards/accuracy_reward/mean": 1.9790844917297363, + "rewards/accuracy_reward/std": 3.5739476680755615, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 953 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 954.0, + "completions/max_terminated_length": 954.0, + "completions/mean_length": 649.671875, + "completions/mean_terminated_length": 649.671875, + "completions/min_length": 386.0, + "completions/min_terminated_length": 386.0, + "epoch": 0.5764350453172206, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05085688829421997, + "learning_rate": 1.4244365793100965e-06, + "loss": -0.0099, + "num_tokens": 161959318.0, + "reward": 2.7822937965393066, + "reward_std": 2.6330783367156982, + "rewards/accuracy_reward/mean": 2.0322937965393066, + "rewards/accuracy_reward/std": 3.3709871768951416, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 954 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 846.0, + "completions/max_terminated_length": 846.0, + "completions/mean_length": 593.71875, + "completions/mean_terminated_length": 593.71875, + "completions/min_length": 331.0, + "completions/min_terminated_length": 331.0, + "epoch": 0.5770392749244713, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03514181450009346, + "learning_rate": 1.4217770246465112e-06, + "loss": -0.0335, + "num_tokens": 162127172.0, + "reward": 3.1903157234191895, + "reward_std": 1.3484553098678589, + "rewards/accuracy_reward/mean": 2.4403157234191895, + "rewards/accuracy_reward/std": 3.469099760055542, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 955 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 955.0, + "completions/max_terminated_length": 955.0, + "completions/mean_length": 616.671875, + "completions/mean_terminated_length": 616.671875, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "epoch": 0.5776435045317221, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0258555319160223, + "learning_rate": 1.419118381477065e-06, + "loss": 0.0015, + "num_tokens": 162299103.0, + "reward": 4.6757378578186035, + "reward_std": 1.0676528215408325, + "rewards/accuracy_reward/mean": 3.9257373809814453, + "rewards/accuracy_reward/std": 3.6389124393463135, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 956 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 762.0, + "completions/max_terminated_length": 762.0, + "completions/mean_length": 503.4375, + "completions/mean_terminated_length": 503.4375, + "completions/min_length": 370.0, + "completions/min_terminated_length": 370.0, + "epoch": 0.5782477341389728, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.014642651192843914, + "learning_rate": 1.416460660420047e-06, + "loss": -0.0044, + "num_tokens": 162463611.0, + "reward": 8.102166175842285, + "reward_std": 0.45770537853240967, + "rewards/accuracy_reward/mean": 7.352167129516602, + "rewards/accuracy_reward/std": 0.8760078549385071, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 957 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 760.0, + "completions/max_terminated_length": 760.0, + "completions/mean_length": 511.546875, + "completions/mean_terminated_length": 511.546875, + "completions/min_length": 349.0, + "completions/min_terminated_length": 349.0, + "epoch": 0.5788519637462236, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.04667382314801216, + "learning_rate": 1.4138038720900644e-06, + "loss": 0.0279, + "num_tokens": 162584958.0, + "reward": 1.7952265739440918, + "reward_std": 1.5852792263031006, + "rewards/accuracy_reward/mean": 1.0452265739440918, + "rewards/accuracy_reward/std": 2.604323387145996, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 958 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 804.0, + "completions/mean_length": 568.578125, + "completions/mean_terminated_length": 545.0952758789062, + "completions/min_length": 291.0, + "completions/min_terminated_length": 291.0, + "epoch": 0.5794561933534743, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.045202262699604034, + "learning_rate": 1.4111480270979994e-06, + "loss": -0.0383, + "num_tokens": 162754003.0, + "reward": 7.055817604064941, + "reward_std": 2.690614938735962, + "rewards/accuracy_reward/mean": 6.317536354064941, + "rewards/accuracy_reward/std": 2.7178192138671875, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 959 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 725.0, + "completions/max_terminated_length": 725.0, + "completions/mean_length": 544.453125, + "completions/mean_terminated_length": 544.453125, + "completions/min_length": 293.0, + "completions/min_terminated_length": 293.0, + "epoch": 0.5800604229607251, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03701598569750786, + "learning_rate": 1.4084931360509656e-06, + "loss": 0.0172, + "num_tokens": 162936688.0, + "reward": 6.811282634735107, + "reward_std": 2.1803345680236816, + "rewards/accuracy_reward/mean": 6.061282634735107, + "rewards/accuracy_reward/std": 2.9349024295806885, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 960 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 880.0, + "completions/max_terminated_length": 880.0, + "completions/mean_length": 547.40625, + "completions/mean_terminated_length": 547.40625, + "completions/min_length": 323.0, + "completions/min_terminated_length": 323.0, + "epoch": 0.5806646525679758, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.025015179067850113, + "learning_rate": 1.4058392095522674e-06, + "loss": 0.0189, + "num_tokens": 163114778.0, + "reward": 5.109681606292725, + "reward_std": 0.9714531302452087, + "rewards/accuracy_reward/mean": 4.359681129455566, + "rewards/accuracy_reward/std": 3.6465628147125244, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 961 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 765.0, + "completions/max_terminated_length": 765.0, + "completions/mean_length": 502.984375, + "completions/mean_terminated_length": 502.984375, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "epoch": 0.5812688821752265, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.020722011104226112, + "learning_rate": 1.4031862582013568e-06, + "loss": 0.0058, + "num_tokens": 163263785.0, + "reward": 4.22599983215332, + "reward_std": 0.9936971664428711, + "rewards/accuracy_reward/mean": 3.4759998321533203, + "rewards/accuracy_reward/std": 3.7633931636810303, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 962 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 776.0, + "completions/max_terminated_length": 776.0, + "completions/mean_length": 503.65625, + "completions/mean_terminated_length": 503.65625, + "completions/min_length": 348.0, + "completions/min_terminated_length": 348.0, + "epoch": 0.5818731117824774, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04224028438329697, + "learning_rate": 1.400534292593791e-06, + "loss": 0.0065, + "num_tokens": 163394259.0, + "reward": 5.562845230102539, + "reward_std": 1.6252214908599854, + "rewards/accuracy_reward/mean": 4.812845230102539, + "rewards/accuracy_reward/std": 3.6163933277130127, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 963 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 690.0, + "completions/max_terminated_length": 690.0, + "completions/mean_length": 458.1875, + "completions/mean_terminated_length": 458.1875, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 0.5824773413897281, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.02319490723311901, + "learning_rate": 1.39788332332119e-06, + "loss": -0.0028, + "num_tokens": 163523903.0, + "reward": 6.564402103424072, + "reward_std": 1.2577463388442993, + "rewards/accuracy_reward/mean": 5.814401626586914, + "rewards/accuracy_reward/std": 2.9777660369873047, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 964 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 974.0, + "completions/max_terminated_length": 974.0, + "completions/mean_length": 549.96875, + "completions/mean_terminated_length": 549.96875, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 0.5830815709969789, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.036960579454898834, + "learning_rate": 1.3952333609711952e-06, + "loss": 0.0223, + "num_tokens": 163660365.0, + "reward": 5.83118200302124, + "reward_std": 1.5623289346694946, + "rewards/accuracy_reward/mean": 5.08118200302124, + "rewards/accuracy_reward/std": 3.377429485321045, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 965 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1116.0, + "completions/mean_length": 711.078125, + "completions/mean_terminated_length": 689.857177734375, + "completions/min_length": 425.0, + "completions/min_terminated_length": 425.0, + "epoch": 0.5836858006042296, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.047965019941329956, + "learning_rate": 1.3925844161274264e-06, + "loss": -0.0804, + "num_tokens": 163836354.0, + "reward": 2.9233484268188477, + "reward_std": 1.6500158309936523, + "rewards/accuracy_reward/mean": 2.1850671768188477, + "rewards/accuracy_reward/std": 3.4626848697662354, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 966 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1125.0, + "completions/max_terminated_length": 1125.0, + "completions/mean_length": 689.609375, + "completions/mean_terminated_length": 689.609375, + "completions/min_length": 380.0, + "completions/min_terminated_length": 380.0, + "epoch": 0.5842900302114804, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03273431211709976, + "learning_rate": 1.3899364993694387e-06, + "loss": 0.0085, + "num_tokens": 163994713.0, + "reward": 6.1161041259765625, + "reward_std": 1.8344496488571167, + "rewards/accuracy_reward/mean": 5.366104602813721, + "rewards/accuracy_reward/std": 3.368708372116089, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 967 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1105.0, + "completions/mean_length": 613.578125, + "completions/mean_terminated_length": 567.3064575195312, + "completions/min_length": 295.0, + "completions/min_terminated_length": 295.0, + "epoch": 0.5848942598187311, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.051801882684230804, + "learning_rate": 1.387289621272683e-06, + "loss": 0.0015, + "num_tokens": 164244446.0, + "reward": 2.077171802520752, + "reward_std": 2.474621534347534, + "rewards/accuracy_reward/mean": 1.350609302520752, + "rewards/accuracy_reward/std": 3.2788162231445312, + "rewards/tag_count_reward/mean": 0.7265625, + "rewards/tag_count_reward/std": 0.13152606785297394, + "step": 968 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 886.0, + "completions/mean_length": 567.0, + "completions/mean_terminated_length": 543.4920654296875, + "completions/min_length": 376.0, + "completions/min_terminated_length": 376.0, + "epoch": 0.5854984894259819, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03897348418831825, + "learning_rate": 1.3846437924084593e-06, + "loss": -0.026, + "num_tokens": 164386702.0, + "reward": 4.908808708190918, + "reward_std": 1.8786985874176025, + "rewards/accuracy_reward/mean": 4.170528411865234, + "rewards/accuracy_reward/std": 3.7410166263580322, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 969 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1946.0, + "completions/max_terminated_length": 1946.0, + "completions/mean_length": 734.890625, + "completions/mean_terminated_length": 734.890625, + "completions/min_length": 328.0, + "completions/min_terminated_length": 328.0, + "epoch": 0.5861027190332326, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.032525721937417984, + "learning_rate": 1.381999023343879e-06, + "loss": 0.0042, + "num_tokens": 164556055.0, + "reward": 1.4160171747207642, + "reward_std": 1.782602071762085, + "rewards/accuracy_reward/mean": 0.6660171747207642, + "rewards/accuracy_reward/std": 2.0042285919189453, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 970 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 906.0, + "completions/max_terminated_length": 906.0, + "completions/mean_length": 516.203125, + "completions/mean_terminated_length": 516.203125, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "epoch": 0.5867069486404833, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03425125777721405, + "learning_rate": 1.3793553246418219e-06, + "loss": -0.0128, + "num_tokens": 164702292.0, + "reward": 4.971823215484619, + "reward_std": 1.2996941804885864, + "rewards/accuracy_reward/mean": 4.221823692321777, + "rewards/accuracy_reward/std": 3.5534634590148926, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 971 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 973.0, + "completions/max_terminated_length": 973.0, + "completions/mean_length": 565.09375, + "completions/mean_terminated_length": 565.09375, + "completions/min_length": 349.0, + "completions/min_terminated_length": 349.0, + "epoch": 0.5873111782477342, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.026494139805436134, + "learning_rate": 1.376712706860888e-06, + "loss": -0.0, + "num_tokens": 164837978.0, + "reward": 6.032624244689941, + "reward_std": 1.061694860458374, + "rewards/accuracy_reward/mean": 5.284577369689941, + "rewards/accuracy_reward/std": 3.4067330360412598, + "rewards/tag_count_reward/mean": 0.748046875, + "rewards/tag_count_reward/std": 0.015625, + "step": 972 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 787.0, + "completions/max_terminated_length": 787.0, + "completions/mean_length": 585.890625, + "completions/mean_terminated_length": 585.890625, + "completions/min_length": 323.0, + "completions/min_terminated_length": 323.0, + "epoch": 0.5879154078549849, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.05200257897377014, + "learning_rate": 1.3740711805553675e-06, + "loss": 0.0105, + "num_tokens": 165015651.0, + "reward": 2.2424890995025635, + "reward_std": 2.628811836242676, + "rewards/accuracy_reward/mean": 1.4924890995025635, + "rewards/accuracy_reward/std": 3.3617618083953857, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 973 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 784.0, + "completions/max_terminated_length": 784.0, + "completions/mean_length": 522.828125, + "completions/mean_terminated_length": 522.828125, + "completions/min_length": 310.0, + "completions/min_terminated_length": 310.0, + "epoch": 0.5885196374622357, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.030852211639285088, + "learning_rate": 1.3714307562751848e-06, + "loss": 0.0025, + "num_tokens": 165192136.0, + "reward": 5.8482160568237305, + "reward_std": 1.2793211936950684, + "rewards/accuracy_reward/mean": 5.098215579986572, + "rewards/accuracy_reward/std": 3.501038074493408, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 974 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 687.0, + "completions/max_terminated_length": 687.0, + "completions/mean_length": 497.4375, + "completions/mean_terminated_length": 497.4375, + "completions/min_length": 272.0, + "completions/min_terminated_length": 272.0, + "epoch": 0.5891238670694864, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03193473443388939, + "learning_rate": 1.3687914445658667e-06, + "loss": 0.0187, + "num_tokens": 165329348.0, + "reward": 5.795265197753906, + "reward_std": 1.2607675790786743, + "rewards/accuracy_reward/mean": 5.0452656745910645, + "rewards/accuracy_reward/std": 3.495155096054077, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 975 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 785.0, + "completions/max_terminated_length": 785.0, + "completions/mean_length": 515.703125, + "completions/mean_terminated_length": 515.703125, + "completions/min_length": 327.0, + "completions/min_terminated_length": 327.0, + "epoch": 0.5897280966767372, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.033573608845472336, + "learning_rate": 1.3661532559684952e-06, + "loss": 0.0173, + "num_tokens": 165469137.0, + "reward": 5.8786420822143555, + "reward_std": 1.3074227571487427, + "rewards/accuracy_reward/mean": 5.1286420822143555, + "rewards/accuracy_reward/std": 3.473351001739502, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 976 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 956.0, + "completions/max_terminated_length": 956.0, + "completions/mean_length": 527.640625, + "completions/mean_terminated_length": 527.640625, + "completions/min_length": 306.0, + "completions/min_terminated_length": 306.0, + "epoch": 0.5903323262839879, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03127802908420563, + "learning_rate": 1.363516201019667e-06, + "loss": -0.0075, + "num_tokens": 165633722.0, + "reward": 2.9702234268188477, + "reward_std": 1.7051866054534912, + "rewards/accuracy_reward/mean": 2.2280359268188477, + "rewards/accuracy_reward/std": 3.4402308464050293, + "rewards/tag_count_reward/mean": 0.7421875, + "rewards/tag_count_reward/std": 0.043842025101184845, + "step": 977 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1541.0, + "completions/max_terminated_length": 1541.0, + "completions/mean_length": 537.890625, + "completions/mean_terminated_length": 537.890625, + "completions/min_length": 353.0, + "completions/min_terminated_length": 353.0, + "epoch": 0.5909365558912387, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04599827155470848, + "learning_rate": 1.360880290251451e-06, + "loss": -0.0382, + "num_tokens": 165763283.0, + "reward": 6.585070610046387, + "reward_std": 1.2301366329193115, + "rewards/accuracy_reward/mean": 5.8350701332092285, + "rewards/accuracy_reward/std": 3.0770933628082275, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 978 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 826.0, + "completions/max_terminated_length": 826.0, + "completions/mean_length": 511.296875, + "completions/mean_terminated_length": 511.296875, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.5915407854984894, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04313879460096359, + "learning_rate": 1.3582455341913468e-06, + "loss": 0.0054, + "num_tokens": 165934774.0, + "reward": 3.8676717281341553, + "reward_std": 2.0634994506835938, + "rewards/accuracy_reward/mean": 3.1176719665527344, + "rewards/accuracy_reward/std": 3.6787400245666504, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 979 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 770.0, + "completions/max_terminated_length": 770.0, + "completions/mean_length": 564.609375, + "completions/mean_terminated_length": 564.609375, + "completions/min_length": 368.0, + "completions/min_terminated_length": 368.0, + "epoch": 0.5921450151057401, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.030208278447389603, + "learning_rate": 1.355611943362241e-06, + "loss": -0.0053, + "num_tokens": 166090109.0, + "reward": 3.1444876194000244, + "reward_std": 1.284603238105774, + "rewards/accuracy_reward/mean": 2.3944876194000244, + "rewards/accuracy_reward/std": 3.440289258956909, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 980 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 988.0, + "completions/max_terminated_length": 988.0, + "completions/mean_length": 479.765625, + "completions/mean_terminated_length": 479.765625, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 0.592749244712991, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.04482696205377579, + "learning_rate": 1.352979528282369e-06, + "loss": 0.0142, + "num_tokens": 166229982.0, + "reward": 3.2636704444885254, + "reward_std": 1.6557819843292236, + "rewards/accuracy_reward/mean": 2.5175764560699463, + "rewards/accuracy_reward/std": 3.576914072036743, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 981 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1315.0, + "completions/max_terminated_length": 1315.0, + "completions/mean_length": 747.484375, + "completions/mean_terminated_length": 747.484375, + "completions/min_length": 374.0, + "completions/min_terminated_length": 374.0, + "epoch": 0.5933534743202417, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04106980189681053, + "learning_rate": 1.3503482994652678e-06, + "loss": 0.0062, + "num_tokens": 166396221.0, + "reward": 4.017316818237305, + "reward_std": 1.6842069625854492, + "rewards/accuracy_reward/mean": 3.267317295074463, + "rewards/accuracy_reward/std": 3.611219644546509, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 982 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1032.0, + "completions/max_terminated_length": 1032.0, + "completions/mean_length": 619.671875, + "completions/mean_terminated_length": 619.671875, + "completions/min_length": 358.0, + "completions/min_terminated_length": 358.0, + "epoch": 0.5939577039274925, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0384136363863945, + "learning_rate": 1.3477182674197373e-06, + "loss": 0.0105, + "num_tokens": 166562024.0, + "reward": 3.9826340675354004, + "reward_std": 2.0764784812927246, + "rewards/accuracy_reward/mean": 3.2326343059539795, + "rewards/accuracy_reward/std": 3.6143879890441895, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 983 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 943.0, + "completions/max_terminated_length": 943.0, + "completions/mean_length": 554.09375, + "completions/mean_terminated_length": 554.09375, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, + "epoch": 0.5945619335347432, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.041840024292469025, + "learning_rate": 1.3450894426497986e-06, + "loss": 0.0045, + "num_tokens": 166732494.0, + "reward": 3.8068931102752686, + "reward_std": 2.064640998840332, + "rewards/accuracy_reward/mean": 3.0568931102752686, + "rewards/accuracy_reward/std": 3.7757041454315186, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 984 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 894.0, + "completions/max_terminated_length": 894.0, + "completions/mean_length": 529.265625, + "completions/mean_terminated_length": 529.265625, + "completions/min_length": 311.0, + "completions/min_terminated_length": 311.0, + "epoch": 0.595166163141994, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.022847097367048264, + "learning_rate": 1.3424618356546497e-06, + "loss": 0.0083, + "num_tokens": 166894447.0, + "reward": 4.870169162750244, + "reward_std": 0.8724104762077332, + "rewards/accuracy_reward/mean": 4.124075412750244, + "rewards/accuracy_reward/std": 3.676179885864258, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 985 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 974.0, + "completions/max_terminated_length": 974.0, + "completions/mean_length": 598.0625, + "completions/mean_terminated_length": 598.0625, + "completions/min_length": 384.0, + "completions/min_terminated_length": 384.0, + "epoch": 0.5957703927492447, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.048591263592243195, + "learning_rate": 1.339835456928626e-06, + "loss": 0.0078, + "num_tokens": 167092595.0, + "reward": 4.237331390380859, + "reward_std": 1.7658226490020752, + "rewards/accuracy_reward/mean": 3.4873313903808594, + "rewards/accuracy_reward/std": 3.741983652114868, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 986 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1110.0, + "completions/max_terminated_length": 1110.0, + "completions/mean_length": 579.84375, + "completions/mean_terminated_length": 579.84375, + "completions/min_length": 330.0, + "completions/min_terminated_length": 330.0, + "epoch": 0.5963746223564955, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06007733941078186, + "learning_rate": 1.3372103169611577e-06, + "loss": 0.045, + "num_tokens": 167249593.0, + "reward": 4.015840530395508, + "reward_std": 3.245729446411133, + "rewards/accuracy_reward/mean": 3.265840530395508, + "rewards/accuracy_reward/std": 3.6912665367126465, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 987 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 849.0, + "completions/max_terminated_length": 849.0, + "completions/mean_length": 542.078125, + "completions/mean_terminated_length": 542.078125, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, + "epoch": 0.5969788519637462, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.037025921046733856, + "learning_rate": 1.3345864262367258e-06, + "loss": -0.0072, + "num_tokens": 167383966.0, + "reward": 5.352081298828125, + "reward_std": 1.0509897470474243, + "rewards/accuracy_reward/mean": 4.602081298828125, + "rewards/accuracy_reward/std": 3.6622262001037598, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 988 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1171.0, + "completions/max_terminated_length": 1171.0, + "completions/mean_length": 559.859375, + "completions/mean_terminated_length": 559.859375, + "completions/min_length": 344.0, + "completions/min_terminated_length": 344.0, + "epoch": 0.5975830815709969, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.047340236604213715, + "learning_rate": 1.331963795234824e-06, + "loss": -0.039, + "num_tokens": 167624773.0, + "reward": 3.1176156997680664, + "reward_std": 2.4103386402130127, + "rewards/accuracy_reward/mean": 2.3676156997680664, + "rewards/accuracy_reward/std": 3.7134711742401123, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 989 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 877.0, + "completions/max_terminated_length": 877.0, + "completions/mean_length": 563.6875, + "completions/mean_terminated_length": 563.6875, + "completions/min_length": 346.0, + "completions/min_terminated_length": 346.0, + "epoch": 0.5981873111782477, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.022256974130868912, + "learning_rate": 1.3293424344299134e-06, + "loss": 0.0145, + "num_tokens": 167840161.0, + "reward": 6.098262786865234, + "reward_std": 0.6503719091415405, + "rewards/accuracy_reward/mean": 5.348262310028076, + "rewards/accuracy_reward/std": 3.372149705886841, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 990 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 882.0, + "completions/max_terminated_length": 882.0, + "completions/mean_length": 558.40625, + "completions/mean_terminated_length": 558.40625, + "completions/min_length": 304.0, + "completions/min_terminated_length": 304.0, + "epoch": 0.5987915407854985, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05057366564869881, + "learning_rate": 1.3267223542913824e-06, + "loss": 0.0264, + "num_tokens": 168074027.0, + "reward": 4.826269149780273, + "reward_std": 2.5904712677001953, + "rewards/accuracy_reward/mean": 4.076269149780273, + "rewards/accuracy_reward/std": 3.7131900787353516, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 991 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 875.0, + "completions/max_terminated_length": 875.0, + "completions/mean_length": 607.8125, + "completions/mean_terminated_length": 607.8125, + "completions/min_length": 372.0, + "completions/min_terminated_length": 372.0, + "epoch": 0.5993957703927493, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.027237189933657646, + "learning_rate": 1.3241035652835048e-06, + "loss": -0.0076, + "num_tokens": 168222159.0, + "reward": 3.8894546031951904, + "reward_std": 0.9456857442855835, + "rewards/accuracy_reward/mean": 3.1394548416137695, + "rewards/accuracy_reward/std": 3.7188212871551514, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 992 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1073.0, + "completions/max_terminated_length": 1073.0, + "completions/mean_length": 547.3125, + "completions/mean_terminated_length": 547.3125, + "completions/min_length": 299.0, + "completions/min_terminated_length": 299.0, + "epoch": 0.6, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04270794987678528, + "learning_rate": 1.3214860778653983e-06, + "loss": 0.0038, + "num_tokens": 168409683.0, + "reward": 3.8421030044555664, + "reward_std": 1.805898666381836, + "rewards/accuracy_reward/mean": 3.0921030044555664, + "rewards/accuracy_reward/std": 3.682518720626831, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 993 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 896.0, + "completions/max_terminated_length": 896.0, + "completions/mean_length": 615.859375, + "completions/mean_terminated_length": 615.859375, + "completions/min_length": 366.0, + "completions/min_terminated_length": 366.0, + "epoch": 0.6006042296072508, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.053538721054792404, + "learning_rate": 1.318869902490981e-06, + "loss": 0.0159, + "num_tokens": 168553402.0, + "reward": 6.786632537841797, + "reward_std": 2.844874382019043, + "rewards/accuracy_reward/mean": 6.036633014678955, + "rewards/accuracy_reward/std": 2.9657657146453857, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 994 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1168.0, + "completions/max_terminated_length": 1168.0, + "completions/mean_length": 732.390625, + "completions/mean_terminated_length": 732.390625, + "completions/min_length": 376.0, + "completions/min_terminated_length": 376.0, + "epoch": 0.6012084592145015, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03986818343400955, + "learning_rate": 1.3162550496089317e-06, + "loss": 0.0154, + "num_tokens": 168701299.0, + "reward": 3.1930108070373535, + "reward_std": 1.9434056282043457, + "rewards/accuracy_reward/mean": 2.4469170570373535, + "rewards/accuracy_reward/std": 3.5424649715423584, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 995 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1150.0, + "completions/max_terminated_length": 1150.0, + "completions/mean_length": 592.1875, + "completions/mean_terminated_length": 592.1875, + "completions/min_length": 312.0, + "completions/min_terminated_length": 312.0, + "epoch": 0.6018126888217523, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.045136742293834686, + "learning_rate": 1.313641529662647e-06, + "loss": -0.0022, + "num_tokens": 168937055.0, + "reward": 4.78920841217041, + "reward_std": 1.9525872468948364, + "rewards/accuracy_reward/mean": 4.03920841217041, + "rewards/accuracy_reward/std": 3.7802164554595947, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 996 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 679.0, + "completions/max_terminated_length": 679.0, + "completions/mean_length": 498.3125, + "completions/mean_terminated_length": 498.3125, + "completions/min_length": 338.0, + "completions/min_terminated_length": 338.0, + "epoch": 0.602416918429003, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.03006085567176342, + "learning_rate": 1.3110293530902004e-06, + "loss": -0.0088, + "num_tokens": 169088307.0, + "reward": 2.1547281742095947, + "reward_std": 1.3473759889602661, + "rewards/accuracy_reward/mean": 1.4047280550003052, + "rewards/accuracy_reward/std": 2.929795980453491, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 997 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1901.0, + "completions/mean_length": 812.84375, + "completions/mean_terminated_length": 752.0983276367188, + "completions/min_length": 449.0, + "completions/min_terminated_length": 449.0, + "epoch": 0.6030211480362537, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0498143769800663, + "learning_rate": 1.3084185303242998e-06, + "loss": 0.024, + "num_tokens": 169258313.0, + "reward": 2.9460108280181885, + "reward_std": 2.484971284866333, + "rewards/accuracy_reward/mean": 2.2311670780181885, + "rewards/accuracy_reward/std": 3.4324147701263428, + "rewards/tag_count_reward/mean": 0.71484375, + "rewards/tag_count_reward/std": 0.1597815304994583, + "step": 998 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 835.0, + "completions/max_terminated_length": 835.0, + "completions/mean_length": 538.9375, + "completions/mean_terminated_length": 538.9375, + "completions/min_length": 353.0, + "completions/min_terminated_length": 353.0, + "epoch": 0.6036253776435045, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04215864837169647, + "learning_rate": 1.3058090717922452e-06, + "loss": -0.0042, + "num_tokens": 169397813.0, + "reward": 5.280721664428711, + "reward_std": 1.4173216819763184, + "rewards/accuracy_reward/mean": 4.530721664428711, + "rewards/accuracy_reward/std": 3.6563591957092285, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 999 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 865.0, + "completions/mean_length": 619.9375, + "completions/mean_terminated_length": 597.2698974609375, + "completions/min_length": 291.0, + "completions/min_terminated_length": 291.0, + "epoch": 0.6042296072507553, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04433785751461983, + "learning_rate": 1.3032009879158905e-06, + "loss": -0.0239, + "num_tokens": 169577457.0, + "reward": 5.309462547302246, + "reward_std": 2.010148048400879, + "rewards/accuracy_reward/mean": 4.571181297302246, + "rewards/accuracy_reward/std": 3.6305158138275146, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 1000 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1695.0, + "completions/max_terminated_length": 1695.0, + "completions/mean_length": 749.25, + "completions/mean_terminated_length": 749.25, + "completions/min_length": 352.0, + "completions/min_terminated_length": 352.0, + "epoch": 0.6048338368580061, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004354197531938553, + "learning_rate": 1.3005942891115968e-06, + "loss": -0.0003, + "num_tokens": 169780305.0, + "reward": 2.677389144897461, + "reward_std": 0.12121033668518066, + "rewards/accuracy_reward/mean": 1.927389144897461, + "rewards/accuracy_reward/std": 3.2186503410339355, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1001 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1448.0, + "completions/max_terminated_length": 1448.0, + "completions/mean_length": 608.40625, + "completions/mean_terminated_length": 608.40625, + "completions/min_length": 275.0, + "completions/min_terminated_length": 275.0, + "epoch": 0.6054380664652568, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.019728606566786766, + "learning_rate": 1.2979889857901952e-06, + "loss": -0.0068, + "num_tokens": 169969371.0, + "reward": 0.9949187636375427, + "reward_std": 0.7537817358970642, + "rewards/accuracy_reward/mean": 0.24491874873638153, + "rewards/accuracy_reward/std": 1.3280363082885742, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1002 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1242.0, + "completions/max_terminated_length": 1242.0, + "completions/mean_length": 600.640625, + "completions/mean_terminated_length": 600.640625, + "completions/min_length": 355.0, + "completions/min_terminated_length": 355.0, + "epoch": 0.6060422960725076, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03422398120164871, + "learning_rate": 1.2953850883569418e-06, + "loss": 0.0355, + "num_tokens": 170148804.0, + "reward": 5.851517200469971, + "reward_std": 1.3503540754318237, + "rewards/accuracy_reward/mean": 5.101517200469971, + "rewards/accuracy_reward/std": 3.4017083644866943, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1003 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 848.0, + "completions/max_terminated_length": 848.0, + "completions/mean_length": 574.65625, + "completions/mean_terminated_length": 574.65625, + "completions/min_length": 307.0, + "completions/min_terminated_length": 307.0, + "epoch": 0.6066465256797583, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.028537781909108162, + "learning_rate": 1.2927826072114794e-06, + "loss": 0.0099, + "num_tokens": 170278814.0, + "reward": 4.46632194519043, + "reward_std": 0.93289715051651, + "rewards/accuracy_reward/mean": 3.7163219451904297, + "rewards/accuracy_reward/std": 3.745783567428589, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1004 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1027.0, + "completions/max_terminated_length": 1027.0, + "completions/mean_length": 585.46875, + "completions/mean_terminated_length": 585.46875, + "completions/min_length": 334.0, + "completions/min_terminated_length": 334.0, + "epoch": 0.6072507552870091, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04575030133128166, + "learning_rate": 1.2901815527477935e-06, + "loss": 0.015, + "num_tokens": 170458796.0, + "reward": 4.989710807800293, + "reward_std": 1.6584842205047607, + "rewards/accuracy_reward/mean": 4.239710807800293, + "rewards/accuracy_reward/std": 3.7185451984405518, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1005 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 751.0, + "completions/max_terminated_length": 751.0, + "completions/mean_length": 541.734375, + "completions/mean_terminated_length": 541.734375, + "completions/min_length": 365.0, + "completions/min_terminated_length": 365.0, + "epoch": 0.6078549848942598, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.045442234724760056, + "learning_rate": 1.2875819353541713e-06, + "loss": 0.0041, + "num_tokens": 170698811.0, + "reward": 7.01437520980835, + "reward_std": 1.9177055358886719, + "rewards/accuracy_reward/mean": 6.26437520980835, + "rewards/accuracy_reward/std": 2.7631685733795166, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1006 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 992.0, + "completions/mean_length": 650.34375, + "completions/mean_terminated_length": 628.1587524414062, + "completions/min_length": 354.0, + "completions/min_terminated_length": 354.0, + "epoch": 0.6084592145015105, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0010936919134110212, + "learning_rate": 1.2849837654131605e-06, + "loss": -0.0065, + "num_tokens": 170865841.0, + "reward": 4.5616326332092285, + "reward_std": 0.13303428888320923, + "rewards/accuracy_reward/mean": 3.8233516216278076, + "rewards/accuracy_reward/std": 3.6717336177825928, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 1007 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 716.0, + "completions/max_terminated_length": 716.0, + "completions/mean_length": 515.78125, + "completions/mean_terminated_length": 515.78125, + "completions/min_length": 290.0, + "completions/min_terminated_length": 290.0, + "epoch": 0.6090634441087613, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.03868729621171951, + "learning_rate": 1.2823870533015295e-06, + "loss": -0.0044, + "num_tokens": 171060899.0, + "reward": 2.1913156509399414, + "reward_std": 0.8391492962837219, + "rewards/accuracy_reward/mean": 1.4413156509399414, + "rewards/accuracy_reward/std": 3.002272129058838, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1008 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 768.0, + "completions/max_terminated_length": 768.0, + "completions/mean_length": 451.09375, + "completions/mean_terminated_length": 451.09375, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 0.609667673716012, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.01130509003996849, + "learning_rate": 1.279791809390222e-06, + "loss": 0.0001, + "num_tokens": 171185737.0, + "reward": 6.186186790466309, + "reward_std": 0.4782644212245941, + "rewards/accuracy_reward/mean": 5.436186790466309, + "rewards/accuracy_reward/std": 3.2957799434661865, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1009 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 875.0, + "completions/max_terminated_length": 875.0, + "completions/mean_length": 523.796875, + "completions/mean_terminated_length": 523.796875, + "completions/min_length": 271.0, + "completions/min_terminated_length": 271.0, + "epoch": 0.6102719033232629, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.022335266694426537, + "learning_rate": 1.2771980440443188e-06, + "loss": 0.0243, + "num_tokens": 171334188.0, + "reward": 2.5717923641204834, + "reward_std": 1.0146706104278564, + "rewards/accuracy_reward/mean": 1.8217921257019043, + "rewards/accuracy_reward/std": 3.2578353881835938, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1010 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 896.0, + "completions/max_terminated_length": 896.0, + "completions/mean_length": 594.96875, + "completions/mean_terminated_length": 594.96875, + "completions/min_length": 392.0, + "completions/min_terminated_length": 392.0, + "epoch": 0.6108761329305136, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.018778325989842415, + "learning_rate": 1.274605767622997e-06, + "loss": 0.0076, + "num_tokens": 171532970.0, + "reward": 4.386843681335449, + "reward_std": 0.5994448065757751, + "rewards/accuracy_reward/mean": 3.636843681335449, + "rewards/accuracy_reward/std": 3.700112819671631, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1011 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 786.0, + "completions/max_terminated_length": 786.0, + "completions/mean_length": 561.359375, + "completions/mean_terminated_length": 561.359375, + "completions/min_length": 317.0, + "completions/min_terminated_length": 317.0, + "epoch": 0.6114803625377644, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.02728140540421009, + "learning_rate": 1.2720149904794846e-06, + "loss": 0.0089, + "num_tokens": 171681905.0, + "reward": 3.9732155799865723, + "reward_std": 0.8396685719490051, + "rewards/accuracy_reward/mean": 3.2232155799865723, + "rewards/accuracy_reward/std": 3.6838948726654053, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1012 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1507.0, + "completions/max_terminated_length": 1507.0, + "completions/mean_length": 448.859375, + "completions/mean_terminated_length": 448.859375, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "epoch": 0.6120845921450151, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05532919242978096, + "learning_rate": 1.2694257229610226e-06, + "loss": 0.0473, + "num_tokens": 171862872.0, + "reward": 6.018365383148193, + "reward_std": 2.5826616287231445, + "rewards/accuracy_reward/mean": 5.268365383148193, + "rewards/accuracy_reward/std": 3.469348192214966, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1013 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 853.0, + "completions/max_terminated_length": 853.0, + "completions/mean_length": 543.421875, + "completions/mean_terminated_length": 543.421875, + "completions/min_length": 348.0, + "completions/min_terminated_length": 348.0, + "epoch": 0.6126888217522659, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.037767745554447174, + "learning_rate": 1.266837975408824e-06, + "loss": 0.0215, + "num_tokens": 172002547.0, + "reward": 7.232465744018555, + "reward_std": 0.9763462543487549, + "rewards/accuracy_reward/mean": 6.482465744018555, + "rewards/accuracy_reward/std": 2.449190139770508, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1014 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 901.0, + "completions/max_terminated_length": 901.0, + "completions/mean_length": 590.3125, + "completions/mean_terminated_length": 590.3125, + "completions/min_length": 325.0, + "completions/min_terminated_length": 325.0, + "epoch": 0.6132930513595166, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05077841505408287, + "learning_rate": 1.26425175815803e-06, + "loss": 0.0153, + "num_tokens": 172221335.0, + "reward": 3.588545083999634, + "reward_std": 2.235440731048584, + "rewards/accuracy_reward/mean": 2.838545322418213, + "rewards/accuracy_reward/std": 3.588048219680786, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1015 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 906.0, + "completions/max_terminated_length": 906.0, + "completions/mean_length": 543.515625, + "completions/mean_terminated_length": 543.515625, + "completions/min_length": 386.0, + "completions/min_terminated_length": 386.0, + "epoch": 0.6138972809667673, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05187322571873665, + "learning_rate": 1.2616670815376697e-06, + "loss": 0.0342, + "num_tokens": 172372136.0, + "reward": 5.778715133666992, + "reward_std": 2.5145153999328613, + "rewards/accuracy_reward/mean": 5.028715133666992, + "rewards/accuracy_reward/std": 3.485184907913208, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1016 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1009.0, + "completions/max_terminated_length": 1009.0, + "completions/mean_length": 516.53125, + "completions/mean_terminated_length": 516.53125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "epoch": 0.6145015105740181, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.029365280643105507, + "learning_rate": 1.25908395587062e-06, + "loss": -0.0013, + "num_tokens": 172511034.0, + "reward": 3.3085060119628906, + "reward_std": 1.3601762056350708, + "rewards/accuracy_reward/mean": 2.5585062503814697, + "rewards/accuracy_reward/std": 3.5631043910980225, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1017 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 753.0, + "completions/max_terminated_length": 753.0, + "completions/mean_length": 466.21875, + "completions/mean_terminated_length": 466.21875, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 0.6151057401812688, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04292844235897064, + "learning_rate": 1.2565023914735626e-06, + "loss": 0.0049, + "num_tokens": 172682008.0, + "reward": 5.943709373474121, + "reward_std": 2.1166951656341553, + "rewards/accuracy_reward/mean": 5.193709373474121, + "rewards/accuracy_reward/std": 3.5652976036071777, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1018 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 892.0, + "completions/max_terminated_length": 892.0, + "completions/mean_length": 518.125, + "completions/mean_terminated_length": 518.125, + "completions/min_length": 295.0, + "completions/min_terminated_length": 295.0, + "epoch": 0.6157099697885197, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.048501547425985336, + "learning_rate": 1.2539223986569451e-06, + "loss": 0.0115, + "num_tokens": 172813904.0, + "reward": 6.343008995056152, + "reward_std": 2.5447630882263184, + "rewards/accuracy_reward/mean": 5.5930094718933105, + "rewards/accuracy_reward/std": 3.2385807037353516, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1019 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 687.0, + "completions/max_terminated_length": 687.0, + "completions/mean_length": 504.828125, + "completions/mean_terminated_length": 504.828125, + "completions/min_length": 303.0, + "completions/min_terminated_length": 303.0, + "epoch": 0.6163141993957704, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03498847037553787, + "learning_rate": 1.2513439877249363e-06, + "loss": 0.0405, + "num_tokens": 172982165.0, + "reward": 6.443079471588135, + "reward_std": 1.587099552154541, + "rewards/accuracy_reward/mean": 5.693079471588135, + "rewards/accuracy_reward/std": 3.175081253051758, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1020 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 2013.0, + "completions/mean_length": 623.359375, + "completions/mean_terminated_length": 600.74609375, + "completions/min_length": 317.0, + "completions/min_terminated_length": 317.0, + "epoch": 0.6169184290030212, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05658036842942238, + "learning_rate": 1.248767168975389e-06, + "loss": -0.0429, + "num_tokens": 173118156.0, + "reward": 6.530637264251709, + "reward_std": 2.537517547607422, + "rewards/accuracy_reward/mean": 5.792356014251709, + "rewards/accuracy_reward/std": 3.1698412895202637, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 1021 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 798.0, + "completions/max_terminated_length": 798.0, + "completions/mean_length": 527.421875, + "completions/mean_terminated_length": 527.421875, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "epoch": 0.6175226586102719, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.028703827410936356, + "learning_rate": 1.2461919526997964e-06, + "loss": -0.0101, + "num_tokens": 173258951.0, + "reward": 5.161438941955566, + "reward_std": 0.9427297115325928, + "rewards/accuracy_reward/mean": 4.411438941955566, + "rewards/accuracy_reward/std": 3.625284194946289, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1022 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 922.0, + "completions/max_terminated_length": 922.0, + "completions/mean_length": 609.078125, + "completions/mean_terminated_length": 609.078125, + "completions/min_length": 338.0, + "completions/min_terminated_length": 338.0, + "epoch": 0.6181268882175227, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.019092900678515434, + "learning_rate": 1.2436183491832518e-06, + "loss": 0.0013, + "num_tokens": 173417980.0, + "reward": 2.352296829223633, + "reward_std": 0.739947497844696, + "rewards/accuracy_reward/mean": 1.6062030792236328, + "rewards/accuracy_reward/std": 3.1236863136291504, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 1023 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 953.0, + "completions/max_terminated_length": 953.0, + "completions/mean_length": 497.40625, + "completions/mean_terminated_length": 497.40625, + "completions/min_length": 275.0, + "completions/min_terminated_length": 275.0, + "epoch": 0.6187311178247734, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.042675990611314774, + "learning_rate": 1.2410463687044063e-06, + "loss": -0.003, + "num_tokens": 173623286.0, + "reward": 2.9364140033721924, + "reward_std": 1.538140058517456, + "rewards/accuracy_reward/mean": 2.1864140033721924, + "rewards/accuracy_reward/std": 3.4282498359680176, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1024 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1256.0, + "completions/max_terminated_length": 1256.0, + "completions/mean_length": 527.21875, + "completions/mean_terminated_length": 527.21875, + "completions/min_length": 362.0, + "completions/min_terminated_length": 362.0, + "epoch": 0.6193353474320241, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.061643585562705994, + "learning_rate": 1.2384760215354303e-06, + "loss": 0.0165, + "num_tokens": 173795396.0, + "reward": 5.793846130371094, + "reward_std": 3.1767382621765137, + "rewards/accuracy_reward/mean": 5.043846130371094, + "rewards/accuracy_reward/std": 3.6056642532348633, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1025 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 835.0, + "completions/max_terminated_length": 835.0, + "completions/mean_length": 526.953125, + "completions/mean_terminated_length": 526.953125, + "completions/min_length": 309.0, + "completions/min_terminated_length": 309.0, + "epoch": 0.6199395770392749, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03533288463950157, + "learning_rate": 1.2359073179419695e-06, + "loss": 0.0006, + "num_tokens": 173955905.0, + "reward": 5.866279602050781, + "reward_std": 1.221541166305542, + "rewards/accuracy_reward/mean": 5.120185852050781, + "rewards/accuracy_reward/std": 3.464700698852539, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 1026 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1178.0, + "completions/max_terminated_length": 1178.0, + "completions/mean_length": 608.515625, + "completions/mean_terminated_length": 608.515625, + "completions/min_length": 421.0, + "completions/min_terminated_length": 421.0, + "epoch": 0.6205438066465256, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0422876812517643, + "learning_rate": 1.233340268183107e-06, + "loss": -0.0246, + "num_tokens": 174168466.0, + "reward": 5.5171403884887695, + "reward_std": 1.579638123512268, + "rewards/accuracy_reward/mean": 4.7671403884887695, + "rewards/accuracy_reward/std": 3.6337101459503174, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1027 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 878.0, + "completions/max_terminated_length": 878.0, + "completions/mean_length": 500.59375, + "completions/mean_terminated_length": 500.59375, + "completions/min_length": 317.0, + "completions/min_terminated_length": 317.0, + "epoch": 0.6211480362537765, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.048256874084472656, + "learning_rate": 1.2307748825113194e-06, + "loss": 0.0129, + "num_tokens": 174325288.0, + "reward": 4.3184919357299805, + "reward_std": 1.6362111568450928, + "rewards/accuracy_reward/mean": 3.5684919357299805, + "rewards/accuracy_reward/std": 3.613504648208618, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1028 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1004.0, + "completions/max_terminated_length": 1004.0, + "completions/mean_length": 589.640625, + "completions/mean_terminated_length": 589.640625, + "completions/min_length": 385.0, + "completions/min_terminated_length": 385.0, + "epoch": 0.6217522658610272, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0382847897708416, + "learning_rate": 1.2282111711724378e-06, + "loss": 0.0164, + "num_tokens": 174494977.0, + "reward": 1.7965718507766724, + "reward_std": 1.4736907482147217, + "rewards/accuracy_reward/mean": 1.0465718507766724, + "rewards/accuracy_reward/std": 2.6162352561950684, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1029 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 905.0, + "completions/max_terminated_length": 905.0, + "completions/mean_length": 575.9375, + "completions/mean_terminated_length": 575.9375, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, + "epoch": 0.622356495468278, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06104864552617073, + "learning_rate": 1.225649144405606e-06, + "loss": 0.0267, + "num_tokens": 174656909.0, + "reward": 5.851564407348633, + "reward_std": 2.6642367839813232, + "rewards/accuracy_reward/mean": 5.101563930511475, + "rewards/accuracy_reward/std": 3.45200252532959, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1030 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 867.0, + "completions/max_terminated_length": 867.0, + "completions/mean_length": 478.9375, + "completions/mean_terminated_length": 478.9375, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 0.6229607250755287, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.026126181706786156, + "learning_rate": 1.2230888124432388e-06, + "loss": -0.0067, + "num_tokens": 174786025.0, + "reward": 1.0857640504837036, + "reward_std": 0.751240611076355, + "rewards/accuracy_reward/mean": 0.335764080286026, + "rewards/accuracy_reward/std": 1.5790926218032837, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1031 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 948.0, + "completions/max_terminated_length": 948.0, + "completions/mean_length": 591.328125, + "completions/mean_terminated_length": 591.328125, + "completions/min_length": 408.0, + "completions/min_terminated_length": 408.0, + "epoch": 0.6235649546827795, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.034161221235990524, + "learning_rate": 1.220530185510985e-06, + "loss": 0.0379, + "num_tokens": 174930638.0, + "reward": 5.961456298828125, + "reward_std": 0.8662195205688477, + "rewards/accuracy_reward/mean": 5.211456298828125, + "rewards/accuracy_reward/std": 3.486217498779297, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1032 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1094.0, + "completions/max_terminated_length": 1094.0, + "completions/mean_length": 665.578125, + "completions/mean_terminated_length": 665.578125, + "completions/min_length": 386.0, + "completions/min_terminated_length": 386.0, + "epoch": 0.6241691842900302, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03512244299054146, + "learning_rate": 1.21797327382768e-06, + "loss": 0.0132, + "num_tokens": 175088467.0, + "reward": 2.325737476348877, + "reward_std": 1.2226120233535767, + "rewards/accuracy_reward/mean": 1.575737476348877, + "rewards/accuracy_reward/std": 3.1936593055725098, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1033 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1188.0, + "completions/max_terminated_length": 1188.0, + "completions/mean_length": 604.015625, + "completions/mean_terminated_length": 604.015625, + "completions/min_length": 344.0, + "completions/min_terminated_length": 344.0, + "epoch": 0.6247734138972809, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05419663339853287, + "learning_rate": 1.2154180876053119e-06, + "loss": -0.0152, + "num_tokens": 175239604.0, + "reward": 5.418445587158203, + "reward_std": 2.887779712677002, + "rewards/accuracy_reward/mean": 4.672351837158203, + "rewards/accuracy_reward/std": 3.6084141731262207, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 1034 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 949.0, + "completions/max_terminated_length": 949.0, + "completions/mean_length": 540.5, + "completions/mean_terminated_length": 540.5, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "epoch": 0.6253776435045317, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.058881547302007675, + "learning_rate": 1.2128646370489763e-06, + "loss": -0.0018, + "num_tokens": 175507604.0, + "reward": 2.8853330612182617, + "reward_std": 2.57232928276062, + "rewards/accuracy_reward/mean": 2.1353328227996826, + "rewards/accuracy_reward/std": 3.584435224533081, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1035 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 735.0, + "completions/max_terminated_length": 735.0, + "completions/mean_length": 501.609375, + "completions/mean_terminated_length": 501.609375, + "completions/min_length": 369.0, + "completions/min_terminated_length": 369.0, + "epoch": 0.6259818731117824, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.040337566286325455, + "learning_rate": 1.2103129323568353e-06, + "loss": -0.0252, + "num_tokens": 175680187.0, + "reward": 5.199524879455566, + "reward_std": 1.7857414484024048, + "rewards/accuracy_reward/mean": 4.449524879455566, + "rewards/accuracy_reward/std": 3.8766791820526123, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1036 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 845.0, + "completions/max_terminated_length": 845.0, + "completions/mean_length": 564.125, + "completions/mean_terminated_length": 564.125, + "completions/min_length": 333.0, + "completions/min_terminated_length": 333.0, + "epoch": 0.6265861027190333, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.028870193287730217, + "learning_rate": 1.2077629837200813e-06, + "loss": 0.0161, + "num_tokens": 175848243.0, + "reward": 4.855381011962891, + "reward_std": 0.9544916749000549, + "rewards/accuracy_reward/mean": 4.105381011962891, + "rewards/accuracy_reward/std": 3.8167030811309814, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1037 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 996.0, + "completions/max_terminated_length": 996.0, + "completions/mean_length": 623.53125, + "completions/mean_terminated_length": 623.53125, + "completions/min_length": 313.0, + "completions/min_terminated_length": 313.0, + "epoch": 0.627190332326284, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.024547534063458443, + "learning_rate": 1.2052148013228906e-06, + "loss": -0.0059, + "num_tokens": 176006661.0, + "reward": 3.192044973373413, + "reward_std": 1.1653889417648315, + "rewards/accuracy_reward/mean": 2.442045211791992, + "rewards/accuracy_reward/std": 3.2767839431762695, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1038 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 816.0, + "completions/max_terminated_length": 816.0, + "completions/mean_length": 519.890625, + "completions/mean_terminated_length": 519.890625, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, + "epoch": 0.6277945619335348, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.00023002459784038365, + "learning_rate": 1.2026683953423861e-06, + "loss": -0.0, + "num_tokens": 176148734.0, + "reward": 2.611631393432617, + "reward_std": 0.007745692972093821, + "rewards/accuracy_reward/mean": 1.861631155014038, + "rewards/accuracy_reward/std": 3.249965190887451, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1039 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 987.0, + "completions/mean_length": 610.859375, + "completions/mean_terminated_length": 564.5, + "completions/min_length": 361.0, + "completions/min_terminated_length": 361.0, + "epoch": 0.6283987915407855, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0351201556622982, + "learning_rate": 1.2001237759485968e-06, + "loss": -0.014, + "num_tokens": 176302485.0, + "reward": 4.354859352111816, + "reward_std": 1.4736719131469727, + "rewards/accuracy_reward/mean": 3.6282968521118164, + "rewards/accuracy_reward/std": 4.017599105834961, + "rewards/tag_count_reward/mean": 0.7265625, + "rewards/tag_count_reward/std": 0.13152606785297394, + "step": 1040 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 850.0, + "completions/mean_length": 618.03125, + "completions/mean_terminated_length": 595.3333740234375, + "completions/min_length": 398.0, + "completions/min_terminated_length": 398.0, + "epoch": 0.6290030211480363, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04901060461997986, + "learning_rate": 1.1975809533044154e-06, + "loss": -0.0239, + "num_tokens": 176551271.0, + "reward": 2.8188374042510986, + "reward_std": 2.39467453956604, + "rewards/accuracy_reward/mean": 2.0805561542510986, + "rewards/accuracy_reward/std": 3.393872022628784, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 1041 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1249.0, + "completions/mean_length": 638.9375, + "completions/mean_terminated_length": 593.4838256835938, + "completions/min_length": 312.0, + "completions/min_terminated_length": 312.0, + "epoch": 0.629607250755287, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.015489859506487846, + "learning_rate": 1.195039937565559e-06, + "loss": -0.0173, + "num_tokens": 176720179.0, + "reward": 6.166668891906738, + "reward_std": 0.6318379640579224, + "rewards/accuracy_reward/mean": 5.440106391906738, + "rewards/accuracy_reward/std": 3.373222589492798, + "rewards/tag_count_reward/mean": 0.7265625, + "rewards/tag_count_reward/std": 0.13152606785297394, + "step": 1042 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 747.0, + "completions/max_terminated_length": 747.0, + "completions/mean_length": 559.359375, + "completions/mean_terminated_length": 559.359375, + "completions/min_length": 389.0, + "completions/min_terminated_length": 389.0, + "epoch": 0.6302114803625377, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04149079695343971, + "learning_rate": 1.1925007388805277e-06, + "loss": 0.0124, + "num_tokens": 176871178.0, + "reward": 7.150429725646973, + "reward_std": 2.4965548515319824, + "rewards/accuracy_reward/mean": 6.400429725646973, + "rewards/accuracy_reward/std": 2.6360793113708496, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1043 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 961.0, + "completions/max_terminated_length": 961.0, + "completions/mean_length": 622.09375, + "completions/mean_terminated_length": 622.09375, + "completions/min_length": 305.0, + "completions/min_terminated_length": 305.0, + "epoch": 0.6308157099697885, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.022459926083683968, + "learning_rate": 1.189963367390565e-06, + "loss": -0.0083, + "num_tokens": 177006112.0, + "reward": 2.271134376525879, + "reward_std": 0.8751441240310669, + "rewards/accuracy_reward/mean": 1.521134376525879, + "rewards/accuracy_reward/std": 3.0363383293151855, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1044 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 951.0, + "completions/max_terminated_length": 951.0, + "completions/mean_length": 596.703125, + "completions/mean_terminated_length": 596.703125, + "completions/min_length": 401.0, + "completions/min_terminated_length": 401.0, + "epoch": 0.6314199395770392, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03897286206483841, + "learning_rate": 1.187427833229617e-06, + "loss": 0.018, + "num_tokens": 177146541.0, + "reward": 3.4208779335021973, + "reward_std": 1.7935292720794678, + "rewards/accuracy_reward/mean": 2.6708779335021973, + "rewards/accuracy_reward/std": 3.49749493598938, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1045 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1444.0, + "completions/mean_length": 679.734375, + "completions/mean_terminated_length": 658.0159301757812, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 0.63202416918429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03431885316967964, + "learning_rate": 1.1848941465242903e-06, + "loss": -0.0254, + "num_tokens": 177328924.0, + "reward": 4.746877670288086, + "reward_std": 1.2501744031906128, + "rewards/accuracy_reward/mean": 4.008596420288086, + "rewards/accuracy_reward/std": 3.6285691261291504, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 1046 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1235.0, + "completions/max_terminated_length": 1235.0, + "completions/mean_length": 623.25, + "completions/mean_terminated_length": 623.25, + "completions/min_length": 390.0, + "completions/min_terminated_length": 390.0, + "epoch": 0.6326283987915408, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.031868286430835724, + "learning_rate": 1.182362317393815e-06, + "loss": 0.0006, + "num_tokens": 177459900.0, + "reward": 5.267256259918213, + "reward_std": 1.0317326784133911, + "rewards/accuracy_reward/mean": 4.521162986755371, + "rewards/accuracy_reward/std": 3.6828622817993164, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 1047 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 813.0, + "completions/max_terminated_length": 813.0, + "completions/mean_length": 537.0625, + "completions/mean_terminated_length": 537.0625, + "completions/min_length": 336.0, + "completions/min_terminated_length": 336.0, + "epoch": 0.6332326283987916, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.043146342039108276, + "learning_rate": 1.1798323559500007e-06, + "loss": -0.0042, + "num_tokens": 177627376.0, + "reward": 2.2508702278137207, + "reward_std": 1.785712718963623, + "rewards/accuracy_reward/mean": 1.5008702278137207, + "rewards/accuracy_reward/std": 2.8519058227539062, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1048 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 769.0, + "completions/max_terminated_length": 769.0, + "completions/mean_length": 539.890625, + "completions/mean_terminated_length": 539.890625, + "completions/min_length": 391.0, + "completions/min_terminated_length": 391.0, + "epoch": 0.6338368580060423, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04595014452934265, + "learning_rate": 1.1773042722971982e-06, + "loss": -0.0119, + "num_tokens": 177818937.0, + "reward": 4.602427959442139, + "reward_std": 2.3587968349456787, + "rewards/accuracy_reward/mean": 3.8524279594421387, + "rewards/accuracy_reward/std": 3.736192226409912, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1049 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 945.0, + "completions/max_terminated_length": 945.0, + "completions/mean_length": 579.0, + "completions/mean_terminated_length": 579.0, + "completions/min_length": 424.0, + "completions/min_terminated_length": 424.0, + "epoch": 0.6344410876132931, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03263707831501961, + "learning_rate": 1.1747780765322597e-06, + "loss": 0.0011, + "num_tokens": 177971721.0, + "reward": 4.781989097595215, + "reward_std": 1.5945956707000732, + "rewards/accuracy_reward/mean": 4.031989097595215, + "rewards/accuracy_reward/std": 3.726534843444824, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1050 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 749.0, + "completions/max_terminated_length": 749.0, + "completions/mean_length": 470.65625, + "completions/mean_terminated_length": 470.65625, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "epoch": 0.6350453172205438, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.004923512693494558, + "learning_rate": 1.1722537787444954e-06, + "loss": -0.0009, + "num_tokens": 178120131.0, + "reward": 6.242105007171631, + "reward_std": 0.13629205524921417, + "rewards/accuracy_reward/mean": 5.492104530334473, + "rewards/accuracy_reward/std": 3.348893880844116, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1051 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 909.0, + "completions/mean_length": 557.875, + "completions/mean_terminated_length": 534.2222290039062, + "completions/min_length": 340.0, + "completions/min_terminated_length": 340.0, + "epoch": 0.6356495468277945, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.04666712507605553, + "learning_rate": 1.169731389015637e-06, + "loss": -0.0104, + "num_tokens": 178440363.0, + "reward": 2.5384531021118164, + "reward_std": 1.574561595916748, + "rewards/accuracy_reward/mean": 1.8001718521118164, + "rewards/accuracy_reward/std": 3.2201499938964844, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 1052 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1071.0, + "completions/max_terminated_length": 1071.0, + "completions/mean_length": 563.9375, + "completions/mean_terminated_length": 563.9375, + "completions/min_length": 313.0, + "completions/min_terminated_length": 313.0, + "epoch": 0.6362537764350453, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.028495581820607185, + "learning_rate": 1.167210917419794e-06, + "loss": -0.0108, + "num_tokens": 178636599.0, + "reward": 5.869410991668701, + "reward_std": 1.2243390083312988, + "rewards/accuracy_reward/mean": 5.119410991668701, + "rewards/accuracy_reward/std": 3.478886604309082, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1053 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 671.0, + "completions/max_terminated_length": 671.0, + "completions/mean_length": 451.015625, + "completions/mean_terminated_length": 451.015625, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "epoch": 0.636858006042296, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.029531436040997505, + "learning_rate": 1.1646923740234174e-06, + "loss": -0.006, + "num_tokens": 178768968.0, + "reward": 5.286941051483154, + "reward_std": 1.0177090167999268, + "rewards/accuracy_reward/mean": 4.536940574645996, + "rewards/accuracy_reward/std": 3.6696343421936035, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1054 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1925.0, + "completions/max_terminated_length": 1925.0, + "completions/mean_length": 627.1875, + "completions/mean_terminated_length": 627.1875, + "completions/min_length": 386.0, + "completions/min_terminated_length": 386.0, + "epoch": 0.6374622356495468, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05461385101079941, + "learning_rate": 1.162175768885255e-06, + "loss": 0.037, + "num_tokens": 178936068.0, + "reward": 5.971179962158203, + "reward_std": 1.8825230598449707, + "rewards/accuracy_reward/mean": 5.221179485321045, + "rewards/accuracy_reward/std": 3.456235647201538, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1055 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 677.0, + "completions/max_terminated_length": 677.0, + "completions/mean_length": 478.875, + "completions/mean_terminated_length": 478.875, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, + "epoch": 0.6380664652567976, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.032571692019701004, + "learning_rate": 1.159661112056314e-06, + "loss": 0.008, + "num_tokens": 179159612.0, + "reward": 5.501473426818848, + "reward_std": 1.5916810035705566, + "rewards/accuracy_reward/mean": 4.751473426818848, + "rewards/accuracy_reward/std": 3.5871224403381348, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1056 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 864.0, + "completions/max_terminated_length": 864.0, + "completions/mean_length": 565.828125, + "completions/mean_terminated_length": 565.828125, + "completions/min_length": 333.0, + "completions/min_terminated_length": 333.0, + "epoch": 0.6386706948640484, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05392301082611084, + "learning_rate": 1.1571484135798212e-06, + "loss": 0.0074, + "num_tokens": 179312433.0, + "reward": 6.528254508972168, + "reward_std": 2.351699113845825, + "rewards/accuracy_reward/mean": 5.778254508972168, + "rewards/accuracy_reward/std": 3.149013042449951, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1057 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1080.0, + "completions/max_terminated_length": 1080.0, + "completions/mean_length": 662.421875, + "completions/mean_terminated_length": 662.421875, + "completions/min_length": 442.0, + "completions/min_terminated_length": 442.0, + "epoch": 0.6392749244712991, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04205012321472168, + "learning_rate": 1.1546376834911812e-06, + "loss": -0.0067, + "num_tokens": 179464652.0, + "reward": 2.9029064178466797, + "reward_std": 2.003549814224243, + "rewards/accuracy_reward/mean": 2.1529061794281006, + "rewards/accuracy_reward/std": 3.462360143661499, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1058 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 852.0, + "completions/max_terminated_length": 852.0, + "completions/mean_length": 497.734375, + "completions/mean_terminated_length": 497.734375, + "completions/min_length": 342.0, + "completions/min_terminated_length": 342.0, + "epoch": 0.6398791540785499, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.021556556224822998, + "learning_rate": 1.1521289318179371e-06, + "loss": 0.0003, + "num_tokens": 179636603.0, + "reward": 6.066145420074463, + "reward_std": 0.9399792551994324, + "rewards/accuracy_reward/mean": 5.316145420074463, + "rewards/accuracy_reward/std": 3.3525545597076416, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1059 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 999.0, + "completions/max_terminated_length": 999.0, + "completions/mean_length": 619.390625, + "completions/mean_terminated_length": 619.390625, + "completions/min_length": 389.0, + "completions/min_terminated_length": 389.0, + "epoch": 0.6404833836858006, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05337924510240555, + "learning_rate": 1.1496221685797313e-06, + "loss": 0.0224, + "num_tokens": 179771412.0, + "reward": 2.6910157203674316, + "reward_std": 2.793199062347412, + "rewards/accuracy_reward/mean": 1.941015601158142, + "rewards/accuracy_reward/std": 3.3288497924804688, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1060 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 717.0, + "completions/max_terminated_length": 717.0, + "completions/mean_length": 438.171875, + "completions/mean_terminated_length": 438.171875, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 0.6410876132930513, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0009707827121019363, + "learning_rate": 1.1471174037882628e-06, + "loss": -0.0007, + "num_tokens": 179947727.0, + "reward": 6.315287113189697, + "reward_std": 0.04981183260679245, + "rewards/accuracy_reward/mean": 5.565286636352539, + "rewards/accuracy_reward/std": 3.2394707202911377, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1061 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 933.0, + "completions/max_terminated_length": 933.0, + "completions/mean_length": 518.03125, + "completions/mean_terminated_length": 518.03125, + "completions/min_length": 324.0, + "completions/min_terminated_length": 324.0, + "epoch": 0.6416918429003021, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.045191846787929535, + "learning_rate": 1.144614647447251e-06, + "loss": 0.0119, + "num_tokens": 180095857.0, + "reward": 5.633074760437012, + "reward_std": 2.355419158935547, + "rewards/accuracy_reward/mean": 4.883074760437012, + "rewards/accuracy_reward/std": 3.4636359214782715, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1062 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1467.0, + "completions/mean_length": 642.09375, + "completions/mean_terminated_length": 619.77783203125, + "completions/min_length": 315.0, + "completions/min_terminated_length": 315.0, + "epoch": 0.6422960725075528, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03170665726065636, + "learning_rate": 1.1421139095523927e-06, + "loss": 0.0284, + "num_tokens": 180257287.0, + "reward": 4.091273307800293, + "reward_std": 1.380185842514038, + "rewards/accuracy_reward/mean": 3.352992296218872, + "rewards/accuracy_reward/std": 3.724553346633911, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 1063 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1137.0, + "completions/max_terminated_length": 1137.0, + "completions/mean_length": 596.09375, + "completions/mean_terminated_length": 596.09375, + "completions/min_length": 310.0, + "completions/min_terminated_length": 310.0, + "epoch": 0.6429003021148036, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04168995842337608, + "learning_rate": 1.1396152000913234e-06, + "loss": -0.0211, + "num_tokens": 180412717.0, + "reward": 4.117537498474121, + "reward_std": 1.8256511688232422, + "rewards/accuracy_reward/mean": 3.367537498474121, + "rewards/accuracy_reward/std": 3.7288591861724854, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1064 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 737.0, + "completions/mean_length": 534.828125, + "completions/mean_terminated_length": 510.8095397949219, + "completions/min_length": 333.0, + "completions/min_terminated_length": 333.0, + "epoch": 0.6435045317220544, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0433335080742836, + "learning_rate": 1.1371185290435784e-06, + "loss": 0.0149, + "num_tokens": 180575746.0, + "reward": 5.812200546264648, + "reward_std": 1.863907814025879, + "rewards/accuracy_reward/mean": 5.07391881942749, + "rewards/accuracy_reward/std": 3.484328031539917, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 1065 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 724.0, + "completions/max_terminated_length": 724.0, + "completions/mean_length": 398.625, + "completions/mean_terminated_length": 398.625, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "epoch": 0.6441087613293052, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03513665497303009, + "learning_rate": 1.13462390638055e-06, + "loss": 0.0192, + "num_tokens": 180703722.0, + "reward": 5.144487380981445, + "reward_std": 0.94181889295578, + "rewards/accuracy_reward/mean": 4.394487380981445, + "rewards/accuracy_reward/std": 3.663947105407715, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1066 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1216.0, + "completions/max_terminated_length": 1216.0, + "completions/mean_length": 629.296875, + "completions/mean_terminated_length": 629.296875, + "completions/min_length": 375.0, + "completions/min_terminated_length": 375.0, + "epoch": 0.6447129909365559, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.030430760234594345, + "learning_rate": 1.1321313420654506e-06, + "loss": 0.0341, + "num_tokens": 180849917.0, + "reward": 4.226607799530029, + "reward_std": 1.2674281597137451, + "rewards/accuracy_reward/mean": 3.4766077995300293, + "rewards/accuracy_reward/std": 3.5734059810638428, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1067 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 790.0, + "completions/max_terminated_length": 790.0, + "completions/mean_length": 547.140625, + "completions/mean_terminated_length": 547.140625, + "completions/min_length": 317.0, + "completions/min_terminated_length": 317.0, + "epoch": 0.6453172205438067, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06053408980369568, + "learning_rate": 1.1296408460532715e-06, + "loss": -0.002, + "num_tokens": 181008422.0, + "reward": 5.520524978637695, + "reward_std": 3.445199728012085, + "rewards/accuracy_reward/mean": 4.770524978637695, + "rewards/accuracy_reward/std": 3.601625442504883, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1068 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 834.0, + "completions/max_terminated_length": 834.0, + "completions/mean_length": 506.109375, + "completions/mean_terminated_length": 506.109375, + "completions/min_length": 328.0, + "completions/min_terminated_length": 328.0, + "epoch": 0.6459214501510574, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03220128268003464, + "learning_rate": 1.1271524282907447e-06, + "loss": -0.0137, + "num_tokens": 181144589.0, + "reward": 7.369865894317627, + "reward_std": 1.4613310098648071, + "rewards/accuracy_reward/mean": 6.619865417480469, + "rewards/accuracy_reward/std": 2.175732374191284, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1069 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 753.0, + "completions/max_terminated_length": 753.0, + "completions/mean_length": 536.5, + "completions/mean_terminated_length": 536.5, + "completions/min_length": 278.0, + "completions/min_terminated_length": 278.0, + "epoch": 0.6465256797583081, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.040016815066337585, + "learning_rate": 1.1246660987162994e-06, + "loss": -0.0356, + "num_tokens": 181343613.0, + "reward": 4.243228435516357, + "reward_std": 1.6477526426315308, + "rewards/accuracy_reward/mean": 3.493227958679199, + "rewards/accuracy_reward/std": 3.748274803161621, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1070 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1046.0, + "completions/max_terminated_length": 1046.0, + "completions/mean_length": 585.21875, + "completions/mean_terminated_length": 585.21875, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 0.6471299093655589, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03220610320568085, + "learning_rate": 1.1221818672600268e-06, + "loss": -0.0061, + "num_tokens": 181498667.0, + "reward": 5.610720634460449, + "reward_std": 1.4185378551483154, + "rewards/accuracy_reward/mean": 4.860720634460449, + "rewards/accuracy_reward/std": 3.565992593765259, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1071 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 719.0, + "completions/max_terminated_length": 719.0, + "completions/mean_length": 499.15625, + "completions/mean_terminated_length": 499.15625, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "epoch": 0.6477341389728096, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03973077982664108, + "learning_rate": 1.1196997438436381e-06, + "loss": -0.0026, + "num_tokens": 181724613.0, + "reward": 4.395264148712158, + "reward_std": 1.566811203956604, + "rewards/accuracy_reward/mean": 3.645264148712158, + "rewards/accuracy_reward/std": 3.807506561279297, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1072 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1088.0, + "completions/max_terminated_length": 1088.0, + "completions/mean_length": 650.484375, + "completions/mean_terminated_length": 650.484375, + "completions/min_length": 405.0, + "completions/min_terminated_length": 405.0, + "epoch": 0.6483383685800604, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.023936325684189796, + "learning_rate": 1.117219738380425e-06, + "loss": 0.0032, + "num_tokens": 181901588.0, + "reward": 4.379512310028076, + "reward_std": 1.153661847114563, + "rewards/accuracy_reward/mean": 3.6295125484466553, + "rewards/accuracy_reward/std": 3.7339870929718018, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1073 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 869.0, + "completions/max_terminated_length": 869.0, + "completions/mean_length": 632.328125, + "completions/mean_terminated_length": 632.328125, + "completions/min_length": 340.0, + "completions/min_terminated_length": 340.0, + "epoch": 0.6489425981873111, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.051407843828201294, + "learning_rate": 1.1147418607752208e-06, + "loss": 0.0318, + "num_tokens": 182053689.0, + "reward": 4.7005205154418945, + "reward_std": 2.3084752559661865, + "rewards/accuracy_reward/mean": 3.9505205154418945, + "rewards/accuracy_reward/std": 3.7091968059539795, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1074 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 973.0, + "completions/max_terminated_length": 973.0, + "completions/mean_length": 598.0625, + "completions/mean_terminated_length": 598.0625, + "completions/min_length": 346.0, + "completions/min_terminated_length": 346.0, + "epoch": 0.649546827794562, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04533200338482857, + "learning_rate": 1.1122661209243584e-06, + "loss": 0.0061, + "num_tokens": 182194445.0, + "reward": 7.020618438720703, + "reward_std": 2.040374755859375, + "rewards/accuracy_reward/mean": 6.270618438720703, + "rewards/accuracy_reward/std": 2.725231885910034, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1075 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1187.0, + "completions/max_terminated_length": 1187.0, + "completions/mean_length": 633.75, + "completions/mean_terminated_length": 633.75, + "completions/min_length": 379.0, + "completions/min_terminated_length": 379.0, + "epoch": 0.6501510574018127, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.020143093541264534, + "learning_rate": 1.1097925287156365e-06, + "loss": -0.0114, + "num_tokens": 182394077.0, + "reward": 4.670576572418213, + "reward_std": 0.6535836458206177, + "rewards/accuracy_reward/mean": 3.920576572418213, + "rewards/accuracy_reward/std": 3.7122325897216797, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1076 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 961.0, + "completions/max_terminated_length": 961.0, + "completions/mean_length": 590.734375, + "completions/mean_terminated_length": 590.734375, + "completions/min_length": 308.0, + "completions/min_terminated_length": 308.0, + "epoch": 0.6507552870090635, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0332554392516613, + "learning_rate": 1.1073210940282734e-06, + "loss": -0.0097, + "num_tokens": 182550396.0, + "reward": 1.3289953470230103, + "reward_std": 1.7995080947875977, + "rewards/accuracy_reward/mean": 0.5789953470230103, + "rewards/accuracy_reward/std": 2.025757074356079, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1077 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 686.0, + "completions/max_terminated_length": 686.0, + "completions/mean_length": 472.328125, + "completions/mean_terminated_length": 472.328125, + "completions/min_length": 310.0, + "completions/min_terminated_length": 310.0, + "epoch": 0.6513595166163142, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.031169448047876358, + "learning_rate": 1.1048518267328713e-06, + "loss": -0.0094, + "num_tokens": 182685889.0, + "reward": 5.9820733070373535, + "reward_std": 1.2490856647491455, + "rewards/accuracy_reward/mean": 5.232073783874512, + "rewards/accuracy_reward/std": 3.4829068183898926, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1078 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 794.0, + "completions/max_terminated_length": 794.0, + "completions/mean_length": 601.796875, + "completions/mean_terminated_length": 601.796875, + "completions/min_length": 414.0, + "completions/min_terminated_length": 414.0, + "epoch": 0.6519637462235649, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03846057131886482, + "learning_rate": 1.1023847366913766e-06, + "loss": -0.0059, + "num_tokens": 182828052.0, + "reward": 7.341841697692871, + "reward_std": 1.8173778057098389, + "rewards/accuracy_reward/mean": 6.591842174530029, + "rewards/accuracy_reward/std": 2.304743528366089, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1079 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 696.0, + "completions/max_terminated_length": 696.0, + "completions/mean_length": 400.5, + "completions/mean_terminated_length": 400.5, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "epoch": 0.6525679758308157, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04379420354962349, + "learning_rate": 1.0999198337570392e-06, + "loss": 0.0021, + "num_tokens": 183061444.0, + "reward": 4.439061164855957, + "reward_std": 2.108471393585205, + "rewards/accuracy_reward/mean": 3.689061164855957, + "rewards/accuracy_reward/std": 3.738330125808716, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1080 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 770.0, + "completions/max_terminated_length": 770.0, + "completions/mean_length": 461.328125, + "completions/mean_terminated_length": 461.328125, + "completions/min_length": 280.0, + "completions/min_terminated_length": 280.0, + "epoch": 0.6531722054380664, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04790308326482773, + "learning_rate": 1.0974571277743746e-06, + "loss": -0.039, + "num_tokens": 183249817.0, + "reward": 3.2531704902648926, + "reward_std": 2.35019850730896, + "rewards/accuracy_reward/mean": 2.5031702518463135, + "rewards/accuracy_reward/std": 3.4849774837493896, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1081 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 771.0, + "completions/max_terminated_length": 771.0, + "completions/mean_length": 456.09375, + "completions/mean_terminated_length": 456.09375, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "epoch": 0.6537764350453172, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05467239394783974, + "learning_rate": 1.0949966285791238e-06, + "loss": 0.0247, + "num_tokens": 183370463.0, + "reward": 5.397615909576416, + "reward_std": 3.058600902557373, + "rewards/accuracy_reward/mean": 4.647615432739258, + "rewards/accuracy_reward/std": 3.5736145973205566, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1082 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 871.0, + "completions/mean_length": 605.625, + "completions/mean_terminated_length": 582.7301635742188, + "completions/min_length": 343.0, + "completions/min_terminated_length": 343.0, + "epoch": 0.654380664652568, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04860894754528999, + "learning_rate": 1.0925383459982143e-06, + "loss": -0.0654, + "num_tokens": 183497255.0, + "reward": 5.353752136230469, + "reward_std": 1.734656810760498, + "rewards/accuracy_reward/mean": 4.615470886230469, + "rewards/accuracy_reward/std": 3.672515869140625, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 1083 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 805.0, + "completions/mean_length": 530.484375, + "completions/mean_terminated_length": 506.3968505859375, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "epoch": 0.6549848942598188, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.05160655081272125, + "learning_rate": 1.0900822898497206e-06, + "loss": -0.0162, + "num_tokens": 183726390.0, + "reward": 3.2107672691345215, + "reward_std": 2.349024772644043, + "rewards/accuracy_reward/mean": 2.4724860191345215, + "rewards/accuracy_reward/std": 3.6189095973968506, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 1084 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1040.0, + "completions/max_terminated_length": 1040.0, + "completions/mean_length": 605.953125, + "completions/mean_terminated_length": 605.953125, + "completions/min_length": 289.0, + "completions/min_terminated_length": 289.0, + "epoch": 0.6555891238670695, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.041458550840616226, + "learning_rate": 1.0876284699428248e-06, + "loss": 0.0136, + "num_tokens": 183861795.0, + "reward": 3.975614070892334, + "reward_std": 1.3971253633499146, + "rewards/accuracy_reward/mean": 3.225614070892334, + "rewards/accuracy_reward/std": 3.5084316730499268, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1085 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 742.0, + "completions/max_terminated_length": 742.0, + "completions/mean_length": 538.53125, + "completions/mean_terminated_length": 538.53125, + "completions/min_length": 370.0, + "completions/min_terminated_length": 370.0, + "epoch": 0.6561933534743203, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0019404975464567542, + "learning_rate": 1.0851768960777784e-06, + "loss": 0.0, + "num_tokens": 184014469.0, + "reward": 4.521014213562012, + "reward_std": 0.07492055743932724, + "rewards/accuracy_reward/mean": 3.7710142135620117, + "rewards/accuracy_reward/std": 3.689505100250244, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1086 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 708.0, + "completions/max_terminated_length": 708.0, + "completions/mean_length": 481.78125, + "completions/mean_terminated_length": 481.78125, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, + "epoch": 0.656797583081571, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03254970908164978, + "learning_rate": 1.082727578045861e-06, + "loss": 0.0167, + "num_tokens": 184163255.0, + "reward": 5.4951372146606445, + "reward_std": 1.5202960968017578, + "rewards/accuracy_reward/mean": 4.7451372146606445, + "rewards/accuracy_reward/std": 3.5831830501556396, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1087 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1134.0, + "completions/max_terminated_length": 1134.0, + "completions/mean_length": 565.5, + "completions/mean_terminated_length": 565.5, + "completions/min_length": 272.0, + "completions/min_terminated_length": 272.0, + "epoch": 0.6574018126888218, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.02833111397922039, + "learning_rate": 1.0802805256293453e-06, + "loss": -0.0121, + "num_tokens": 184332103.0, + "reward": 2.0236124992370605, + "reward_std": 1.163029432296753, + "rewards/accuracy_reward/mean": 1.2736124992370605, + "rewards/accuracy_reward/std": 2.858699321746826, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1088 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1306.0, + "completions/max_terminated_length": 1306.0, + "completions/mean_length": 578.796875, + "completions/mean_terminated_length": 578.796875, + "completions/min_length": 335.0, + "completions/min_terminated_length": 335.0, + "epoch": 0.6580060422960725, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.02044396661221981, + "learning_rate": 1.0778357486014526e-06, + "loss": 0.0003, + "num_tokens": 184506906.0, + "reward": 4.239712238311768, + "reward_std": 0.664071798324585, + "rewards/accuracy_reward/mean": 3.4897124767303467, + "rewards/accuracy_reward/std": 3.7479982376098633, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1089 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 794.0, + "completions/max_terminated_length": 794.0, + "completions/mean_length": 524.21875, + "completions/mean_terminated_length": 524.21875, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, + "epoch": 0.6586102719033232, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.05295668542385101, + "learning_rate": 1.0753932567263185e-06, + "loss": 0.0153, + "num_tokens": 184641816.0, + "reward": 2.464235782623291, + "reward_std": 2.3461434841156006, + "rewards/accuracy_reward/mean": 1.7142359018325806, + "rewards/accuracy_reward/std": 3.162766218185425, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1090 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 861.0, + "completions/max_terminated_length": 861.0, + "completions/mean_length": 573.84375, + "completions/mean_terminated_length": 573.84375, + "completions/min_length": 402.0, + "completions/min_terminated_length": 402.0, + "epoch": 0.659214501510574, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.030997702851891518, + "learning_rate": 1.0729530597589513e-06, + "loss": -0.0021, + "num_tokens": 184823806.0, + "reward": 5.749734401702881, + "reward_std": 1.395470142364502, + "rewards/accuracy_reward/mean": 4.999734401702881, + "rewards/accuracy_reward/std": 3.521714687347412, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1091 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 857.0, + "completions/max_terminated_length": 857.0, + "completions/mean_length": 559.84375, + "completions/mean_terminated_length": 559.84375, + "completions/min_length": 388.0, + "completions/min_terminated_length": 388.0, + "epoch": 0.6598187311178247, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.018981216475367546, + "learning_rate": 1.0705151674451938e-06, + "loss": 0.0016, + "num_tokens": 185002500.0, + "reward": 4.705643653869629, + "reward_std": 0.6486063003540039, + "rewards/accuracy_reward/mean": 3.955643653869629, + "rewards/accuracy_reward/std": 3.745123863220215, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1092 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 810.0, + "completions/max_terminated_length": 810.0, + "completions/mean_length": 585.0, + "completions/mean_terminated_length": 585.0, + "completions/min_length": 323.0, + "completions/min_terminated_length": 323.0, + "epoch": 0.6604229607250756, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04955261945724487, + "learning_rate": 1.0680795895216846e-06, + "loss": 0.031, + "num_tokens": 185166340.0, + "reward": 2.8256516456604004, + "reward_std": 1.961263656616211, + "rewards/accuracy_reward/mean": 2.0756516456604004, + "rewards/accuracy_reward/std": 3.3732593059539795, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1093 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 826.0, + "completions/max_terminated_length": 826.0, + "completions/mean_length": 518.796875, + "completions/mean_terminated_length": 518.796875, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "epoch": 0.6610271903323263, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03834668546915054, + "learning_rate": 1.0656463357158164e-06, + "loss": 0.0372, + "num_tokens": 185344295.0, + "reward": 5.131226539611816, + "reward_std": 1.7596832513809204, + "rewards/accuracy_reward/mean": 4.385132312774658, + "rewards/accuracy_reward/std": 3.6614551544189453, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 1094 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 940.0, + "completions/max_terminated_length": 940.0, + "completions/mean_length": 626.171875, + "completions/mean_terminated_length": 626.171875, + "completions/min_length": 376.0, + "completions/min_terminated_length": 376.0, + "epoch": 0.6616314199395771, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0016792021924629807, + "learning_rate": 1.063215415745705e-06, + "loss": 0.0004, + "num_tokens": 185473202.0, + "reward": 2.6896281242370605, + "reward_std": 0.059410277754068375, + "rewards/accuracy_reward/mean": 1.9396281242370605, + "rewards/accuracy_reward/std": 3.217794895172119, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1095 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 733.0, + "completions/max_terminated_length": 733.0, + "completions/mean_length": 517.09375, + "completions/mean_terminated_length": 517.09375, + "completions/min_length": 336.0, + "completions/min_terminated_length": 336.0, + "epoch": 0.6622356495468278, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.043224435299634933, + "learning_rate": 1.0607868393201406e-06, + "loss": 0.0237, + "num_tokens": 185653544.0, + "reward": 2.579817295074463, + "reward_std": 1.6627928018569946, + "rewards/accuracy_reward/mean": 1.833723545074463, + "rewards/accuracy_reward/std": 3.1330678462982178, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 1096 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 968.0, + "completions/max_terminated_length": 968.0, + "completions/mean_length": 581.296875, + "completions/mean_terminated_length": 581.296875, + "completions/min_length": 296.0, + "completions/min_terminated_length": 296.0, + "epoch": 0.6628398791540786, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04711684584617615, + "learning_rate": 1.0583606161385542e-06, + "loss": 0.004, + "num_tokens": 185824347.0, + "reward": 2.924738883972168, + "reward_std": 2.037982225418091, + "rewards/accuracy_reward/mean": 2.174738883972168, + "rewards/accuracy_reward/std": 3.4379618167877197, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1097 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 796.0, + "completions/max_terminated_length": 796.0, + "completions/mean_length": 476.953125, + "completions/mean_terminated_length": 476.953125, + "completions/min_length": 273.0, + "completions/min_terminated_length": 273.0, + "epoch": 0.6634441087613293, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06241033971309662, + "learning_rate": 1.0559367558909806e-06, + "loss": 0.0325, + "num_tokens": 186073720.0, + "reward": 2.915842056274414, + "reward_std": 3.3803396224975586, + "rewards/accuracy_reward/mean": 2.165842056274414, + "rewards/accuracy_reward/std": 3.4329893589019775, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1098 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1015.0, + "completions/mean_length": 631.609375, + "completions/mean_terminated_length": 609.1270141601562, + "completions/min_length": 299.0, + "completions/min_terminated_length": 299.0, + "epoch": 0.66404833836858, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.05346588417887688, + "learning_rate": 1.0535152682580146e-06, + "loss": -0.0346, + "num_tokens": 186270959.0, + "reward": 1.9651484489440918, + "reward_std": 2.1465048789978027, + "rewards/accuracy_reward/mean": 1.2268671989440918, + "rewards/accuracy_reward/std": 3.0406928062438965, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 1099 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1123.0, + "completions/mean_length": 496.359375, + "completions/mean_terminated_length": 471.7301940917969, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 0.6646525679758308, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03034082241356373, + "learning_rate": 1.0510961629107764e-06, + "loss": -0.0133, + "num_tokens": 186403686.0, + "reward": 3.798699140548706, + "reward_std": 1.5331239700317383, + "rewards/accuracy_reward/mean": 3.060417652130127, + "rewards/accuracy_reward/std": 3.6055190563201904, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 1100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 818.0, + "completions/max_terminated_length": 818.0, + "completions/mean_length": 593.53125, + "completions/mean_terminated_length": 593.53125, + "completions/min_length": 392.0, + "completions/min_terminated_length": 392.0, + "epoch": 0.6652567975830815, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03094199113547802, + "learning_rate": 1.0486794495108713e-06, + "loss": 0.0021, + "num_tokens": 186657560.0, + "reward": 6.009720325469971, + "reward_std": 1.0723977088928223, + "rewards/accuracy_reward/mean": 5.259720325469971, + "rewards/accuracy_reward/std": 3.4158732891082764, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 905.0, + "completions/mean_length": 565.125, + "completions/mean_terminated_length": 541.5873413085938, + "completions/min_length": 345.0, + "completions/min_terminated_length": 345.0, + "epoch": 0.6658610271903324, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04705759510397911, + "learning_rate": 1.046265137710352e-06, + "loss": -0.0135, + "num_tokens": 186807184.0, + "reward": 5.944365501403809, + "reward_std": 2.826533794403076, + "rewards/accuracy_reward/mean": 5.206084251403809, + "rewards/accuracy_reward/std": 3.457965850830078, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 1102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 780.0, + "completions/max_terminated_length": 780.0, + "completions/mean_length": 528.65625, + "completions/mean_terminated_length": 528.65625, + "completions/min_length": 328.0, + "completions/min_terminated_length": 328.0, + "epoch": 0.6664652567975831, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03165678679943085, + "learning_rate": 1.0438532371516794e-06, + "loss": 0.0137, + "num_tokens": 186966858.0, + "reward": 7.680995464324951, + "reward_std": 0.9421070218086243, + "rewards/accuracy_reward/mean": 6.930995464324951, + "rewards/accuracy_reward/std": 2.002134323120117, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 716.0, + "completions/mean_length": 506.578125, + "completions/mean_terminated_length": 482.11114501953125, + "completions/min_length": 280.0, + "completions/min_terminated_length": 280.0, + "epoch": 0.6670694864048339, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0290814358741045, + "learning_rate": 1.0414437574676832e-06, + "loss": -0.0096, + "num_tokens": 187138319.0, + "reward": 4.792103290557861, + "reward_std": 1.4146559238433838, + "rewards/accuracy_reward/mean": 4.053822040557861, + "rewards/accuracy_reward/std": 3.761322259902954, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 1104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 935.0, + "completions/max_terminated_length": 935.0, + "completions/mean_length": 579.03125, + "completions/mean_terminated_length": 579.03125, + "completions/min_length": 398.0, + "completions/min_terminated_length": 398.0, + "epoch": 0.6676737160120846, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.047312501817941666, + "learning_rate": 1.0390367082815259e-06, + "loss": 0.0031, + "num_tokens": 187284609.0, + "reward": 2.78650164604187, + "reward_std": 2.1746575832366943, + "rewards/accuracy_reward/mean": 2.03650164604187, + "rewards/accuracy_reward/std": 3.429924726486206, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 627.0, + "completions/max_terminated_length": 627.0, + "completions/mean_length": 447.515625, + "completions/mean_terminated_length": 447.515625, + "completions/min_length": 275.0, + "completions/min_terminated_length": 275.0, + "epoch": 0.6682779456193354, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.034569088369607925, + "learning_rate": 1.0366320992066615e-06, + "loss": -0.027, + "num_tokens": 187429058.0, + "reward": 4.432764053344727, + "reward_std": 1.3110898733139038, + "rewards/accuracy_reward/mean": 3.6827640533447266, + "rewards/accuracy_reward/std": 3.7459280490875244, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1289.0, + "completions/max_terminated_length": 1289.0, + "completions/mean_length": 591.625, + "completions/mean_terminated_length": 591.625, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "epoch": 0.6688821752265861, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.038694266229867935, + "learning_rate": 1.0342299398467992e-06, + "loss": 0.0016, + "num_tokens": 187568842.0, + "reward": 1.4397592544555664, + "reward_std": 1.8802200555801392, + "rewards/accuracy_reward/mean": 0.689759373664856, + "rewards/accuracy_reward/std": 2.2441532611846924, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 815.0, + "completions/max_terminated_length": 815.0, + "completions/mean_length": 528.671875, + "completions/mean_terminated_length": 528.671875, + "completions/min_length": 299.0, + "completions/min_terminated_length": 299.0, + "epoch": 0.6694864048338368, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.022910287603735924, + "learning_rate": 1.0318302397958647e-06, + "loss": 0.0, + "num_tokens": 187720373.0, + "reward": 4.586979389190674, + "reward_std": 1.1085734367370605, + "rewards/accuracy_reward/mean": 3.836979866027832, + "rewards/accuracy_reward/std": 3.748340129852295, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 810.0, + "completions/max_terminated_length": 810.0, + "completions/mean_length": 492.765625, + "completions/mean_terminated_length": 492.765625, + "completions/min_length": 301.0, + "completions/min_terminated_length": 301.0, + "epoch": 0.6700906344410876, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.013244211673736572, + "learning_rate": 1.0294330086379612e-06, + "loss": -0.0032, + "num_tokens": 187865414.0, + "reward": 6.222612380981445, + "reward_std": 0.5200967192649841, + "rewards/accuracy_reward/mean": 5.472611904144287, + "rewards/accuracy_reward/std": 3.301236629486084, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 748.0, + "completions/max_terminated_length": 748.0, + "completions/mean_length": 499.578125, + "completions/mean_terminated_length": 499.578125, + "completions/min_length": 320.0, + "completions/min_terminated_length": 320.0, + "epoch": 0.6706948640483383, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05292810872197151, + "learning_rate": 1.0270382559473312e-06, + "loss": -0.0019, + "num_tokens": 188007243.0, + "reward": 6.840279579162598, + "reward_std": 1.7353177070617676, + "rewards/accuracy_reward/mean": 6.090279579162598, + "rewards/accuracy_reward/std": 2.841427803039551, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 854.0, + "completions/max_terminated_length": 854.0, + "completions/mean_length": 536.9375, + "completions/mean_terminated_length": 536.9375, + "completions/min_length": 291.0, + "completions/min_terminated_length": 291.0, + "epoch": 0.6712990936555892, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.02824885956943035, + "learning_rate": 1.024645991288318e-06, + "loss": -0.0048, + "num_tokens": 188198583.0, + "reward": 2.7280373573303223, + "reward_std": 1.1017073392868042, + "rewards/accuracy_reward/mean": 1.9780375957489014, + "rewards/accuracy_reward/std": 3.314985990524292, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 794.0, + "completions/max_terminated_length": 794.0, + "completions/mean_length": 528.484375, + "completions/mean_terminated_length": 528.484375, + "completions/min_length": 314.0, + "completions/min_terminated_length": 314.0, + "epoch": 0.6719033232628399, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.053946927189826965, + "learning_rate": 1.02225622421533e-06, + "loss": 0.0539, + "num_tokens": 188339878.0, + "reward": 6.260198593139648, + "reward_std": 1.9512572288513184, + "rewards/accuracy_reward/mean": 5.510198593139648, + "rewards/accuracy_reward/std": 3.3037326335906982, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1597.0, + "completions/max_terminated_length": 1597.0, + "completions/mean_length": 792.71875, + "completions/mean_terminated_length": 792.71875, + "completions/min_length": 459.0, + "completions/min_terminated_length": 459.0, + "epoch": 0.6725075528700907, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.046819016337394714, + "learning_rate": 1.0198689642727986e-06, + "loss": 0.0968, + "num_tokens": 188502260.0, + "reward": 2.293665885925293, + "reward_std": 1.6764682531356812, + "rewards/accuracy_reward/mean": 1.5436656475067139, + "rewards/accuracy_reward/std": 2.9957640171051025, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1119.0, + "completions/mean_length": 647.96875, + "completions/mean_terminated_length": 625.74609375, + "completions/min_length": 333.0, + "completions/min_terminated_length": 333.0, + "epoch": 0.6731117824773414, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.031275052577257156, + "learning_rate": 1.017484220995142e-06, + "loss": 0.0134, + "num_tokens": 188733778.0, + "reward": 2.3098483085632324, + "reward_std": 1.32057523727417, + "rewards/accuracy_reward/mean": 1.571567177772522, + "rewards/accuracy_reward/std": 3.074660062789917, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 1114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 882.0, + "completions/max_terminated_length": 882.0, + "completions/mean_length": 576.953125, + "completions/mean_terminated_length": 576.953125, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, + "epoch": 0.6737160120845922, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.032147251069545746, + "learning_rate": 1.0151020039067293e-06, + "loss": 0.0001, + "num_tokens": 188885983.0, + "reward": 4.693493843078613, + "reward_std": 1.704464316368103, + "rewards/accuracy_reward/mean": 3.943493604660034, + "rewards/accuracy_reward/std": 3.6433184146881104, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 712.0, + "completions/max_terminated_length": 712.0, + "completions/mean_length": 502.6875, + "completions/mean_terminated_length": 502.6875, + "completions/min_length": 258.0, + "completions/min_terminated_length": 258.0, + "epoch": 0.6743202416918429, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0004268861666787416, + "learning_rate": 1.0127223225218379e-06, + "loss": -0.0005, + "num_tokens": 189036523.0, + "reward": 6.323220252990723, + "reward_std": 0.02460266649723053, + "rewards/accuracy_reward/mean": 5.573220252990723, + "rewards/accuracy_reward/std": 3.2434263229370117, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 765.0, + "completions/mean_length": 538.859375, + "completions/mean_terminated_length": 514.90478515625, + "completions/min_length": 284.0, + "completions/min_terminated_length": 284.0, + "epoch": 0.6749244712990936, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.029174430295825005, + "learning_rate": 1.0103451863446184e-06, + "loss": -0.0056, + "num_tokens": 189182738.0, + "reward": 3.8847219944000244, + "reward_std": 0.9926830530166626, + "rewards/accuracy_reward/mean": 3.1464407444000244, + "rewards/accuracy_reward/std": 3.7163753509521484, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 1117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 891.0, + "completions/max_terminated_length": 891.0, + "completions/mean_length": 529.15625, + "completions/mean_terminated_length": 529.15625, + "completions/min_length": 345.0, + "completions/min_terminated_length": 345.0, + "epoch": 0.6755287009063444, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.026826199144124985, + "learning_rate": 1.0079706048690577e-06, + "loss": 0.0058, + "num_tokens": 189320524.0, + "reward": 6.1122612953186035, + "reward_std": 1.601077675819397, + "rewards/accuracy_reward/mean": 5.362260818481445, + "rewards/accuracy_reward/std": 3.356468677520752, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 883.0, + "completions/max_terminated_length": 883.0, + "completions/mean_length": 536.859375, + "completions/mean_terminated_length": 536.859375, + "completions/min_length": 289.0, + "completions/min_terminated_length": 289.0, + "epoch": 0.6761329305135951, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0484844334423542, + "learning_rate": 1.0055985875789381e-06, + "loss": 0.0223, + "num_tokens": 189449875.0, + "reward": 7.173917770385742, + "reward_std": 2.1305527687072754, + "rewards/accuracy_reward/mean": 6.423918724060059, + "rewards/accuracy_reward/std": 2.4982001781463623, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 829.0, + "completions/max_terminated_length": 829.0, + "completions/mean_length": 587.3125, + "completions/mean_terminated_length": 587.3125, + "completions/min_length": 417.0, + "completions/min_terminated_length": 417.0, + "epoch": 0.676737160120846, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06210639700293541, + "learning_rate": 1.0032291439478008e-06, + "loss": 0.0404, + "num_tokens": 189583271.0, + "reward": 4.835323333740234, + "reward_std": 3.2760019302368164, + "rewards/accuracy_reward/mean": 4.085323333740234, + "rewards/accuracy_reward/std": 3.7857251167297363, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1378.0, + "completions/max_terminated_length": 1378.0, + "completions/mean_length": 545.96875, + "completions/mean_terminated_length": 545.96875, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, + "epoch": 0.6773413897280967, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.026452772319316864, + "learning_rate": 1.0008622834389087e-06, + "loss": -0.0031, + "num_tokens": 189884245.0, + "reward": 1.1125437021255493, + "reward_std": 1.1757752895355225, + "rewards/accuracy_reward/mean": 0.3625437319278717, + "rewards/accuracy_reward/std": 1.5800868272781372, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 893.0, + "completions/max_terminated_length": 893.0, + "completions/mean_length": 503.46875, + "completions/mean_terminated_length": 503.46875, + "completions/min_length": 322.0, + "completions/min_terminated_length": 322.0, + "epoch": 0.6779456193353475, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.020523525774478912, + "learning_rate": 9.984980155052087e-07, + "loss": 0.0017, + "num_tokens": 190037411.0, + "reward": 6.324872016906738, + "reward_std": 0.9478549957275391, + "rewards/accuracy_reward/mean": 5.574872016906738, + "rewards/accuracy_reward/std": 3.244264602661133, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1113.0, + "completions/max_terminated_length": 1113.0, + "completions/mean_length": 635.84375, + "completions/mean_terminated_length": 635.84375, + "completions/min_length": 290.0, + "completions/min_terminated_length": 290.0, + "epoch": 0.6785498489425982, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.010647764429450035, + "learning_rate": 9.961363495892917e-07, + "loss": 0.0007, + "num_tokens": 190196073.0, + "reward": 2.7776906490325928, + "reward_std": 0.2869144380092621, + "rewards/accuracy_reward/mean": 2.0276906490325928, + "rewards/accuracy_reward/std": 3.184967041015625, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 740.0, + "completions/max_terminated_length": 740.0, + "completions/mean_length": 490.84375, + "completions/mean_terminated_length": 490.84375, + "completions/min_length": 394.0, + "completions/min_terminated_length": 394.0, + "epoch": 0.679154078549849, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.037539634853601456, + "learning_rate": 9.93777295123357e-07, + "loss": 0.0112, + "num_tokens": 190351551.0, + "reward": 7.286023139953613, + "reward_std": 1.9604501724243164, + "rewards/accuracy_reward/mean": 6.536023139953613, + "rewards/accuracy_reward/std": 2.4441797733306885, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 810.0, + "completions/max_terminated_length": 810.0, + "completions/mean_length": 576.828125, + "completions/mean_terminated_length": 576.828125, + "completions/min_length": 401.0, + "completions/min_terminated_length": 401.0, + "epoch": 0.6797583081570997, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0344264879822731, + "learning_rate": 9.914208615291753e-07, + "loss": 0.0036, + "num_tokens": 190593204.0, + "reward": 4.279862403869629, + "reward_std": 1.2258906364440918, + "rewards/accuracy_reward/mean": 3.529862403869629, + "rewards/accuracy_reward/std": 3.837162971496582, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1082.0, + "completions/mean_length": 626.359375, + "completions/mean_terminated_length": 603.793701171875, + "completions/min_length": 369.0, + "completions/min_terminated_length": 369.0, + "epoch": 0.6803625377643504, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.049970049411058426, + "learning_rate": 9.89067058218048e-07, + "loss": 0.0312, + "num_tokens": 190753643.0, + "reward": 3.7347655296325684, + "reward_std": 2.412990093231201, + "rewards/accuracy_reward/mean": 2.9964842796325684, + "rewards/accuracy_reward/std": 3.7194344997406006, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 1126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 890.0, + "completions/max_terminated_length": 890.0, + "completions/mean_length": 587.875, + "completions/mean_terminated_length": 587.875, + "completions/min_length": 286.0, + "completions/min_terminated_length": 286.0, + "epoch": 0.6809667673716012, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0048362743109464645, + "learning_rate": 9.867158945907725e-07, + "loss": 0.003, + "num_tokens": 190909219.0, + "reward": 4.543928146362305, + "reward_std": 0.17991848289966583, + "rewards/accuracy_reward/mean": 3.7939281463623047, + "rewards/accuracy_reward/std": 3.6795575618743896, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 876.0, + "completions/max_terminated_length": 876.0, + "completions/mean_length": 651.28125, + "completions/mean_terminated_length": 651.28125, + "completions/min_length": 418.0, + "completions/min_terminated_length": 418.0, + "epoch": 0.6815709969788519, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05036791041493416, + "learning_rate": 9.843673800376037e-07, + "loss": -0.0008, + "num_tokens": 191063365.0, + "reward": 5.330898284912109, + "reward_std": 2.4260988235473633, + "rewards/accuracy_reward/mean": 4.580898284912109, + "rewards/accuracy_reward/std": 3.6457815170288086, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1082.0, + "completions/max_terminated_length": 1082.0, + "completions/mean_length": 626.59375, + "completions/mean_terminated_length": 626.59375, + "completions/min_length": 347.0, + "completions/min_terminated_length": 347.0, + "epoch": 0.6821752265861027, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.02367953583598137, + "learning_rate": 9.820215239382166e-07, + "loss": 0.0109, + "num_tokens": 191207195.0, + "reward": 2.871398448944092, + "reward_std": 0.9339002966880798, + "rewards/accuracy_reward/mean": 2.121398448944092, + "rewards/accuracy_reward/std": 3.3619532585144043, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 738.0, + "completions/max_terminated_length": 738.0, + "completions/mean_length": 516.390625, + "completions/mean_terminated_length": 516.390625, + "completions/min_length": 312.0, + "completions/min_terminated_length": 312.0, + "epoch": 0.6827794561933535, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03567865118384361, + "learning_rate": 9.796783356616676e-07, + "loss": -0.0104, + "num_tokens": 191483764.0, + "reward": 4.19475793838501, + "reward_std": 1.3019901514053345, + "rewards/accuracy_reward/mean": 3.4447579383850098, + "rewards/accuracy_reward/std": 3.763537645339966, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 822.0, + "completions/mean_length": 566.6875, + "completions/mean_terminated_length": 543.1746215820312, + "completions/min_length": 369.0, + "completions/min_terminated_length": 369.0, + "epoch": 0.6833836858006043, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004039437975734472, + "learning_rate": 9.773378245663586e-07, + "loss": -0.0064, + "num_tokens": 191640560.0, + "reward": 4.35329532623291, + "reward_std": 0.15730832517147064, + "rewards/accuracy_reward/mean": 3.61501407623291, + "rewards/accuracy_reward/std": 3.810403347015381, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 1131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1223.0, + "completions/max_terminated_length": 1223.0, + "completions/mean_length": 621.8125, + "completions/mean_terminated_length": 621.8125, + "completions/min_length": 310.0, + "completions/min_terminated_length": 310.0, + "epoch": 0.683987915407855, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.006503232289105654, + "learning_rate": 9.750000000000004e-07, + "loss": -0.0027, + "num_tokens": 191798148.0, + "reward": 2.5791187286376953, + "reward_std": 0.1892385482788086, + "rewards/accuracy_reward/mean": 1.8291187286376953, + "rewards/accuracy_reward/std": 3.265423536300659, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1222.0, + "completions/max_terminated_length": 1222.0, + "completions/mean_length": 530.140625, + "completions/mean_terminated_length": 530.140625, + "completions/min_length": 286.0, + "completions/min_terminated_length": 286.0, + "epoch": 0.6845921450151058, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.034825727343559265, + "learning_rate": 9.726648712995726e-07, + "loss": -0.0126, + "num_tokens": 191963181.0, + "reward": 1.2267796993255615, + "reward_std": 1.5472898483276367, + "rewards/accuracy_reward/mean": 0.47677966952323914, + "rewards/accuracy_reward/std": 2.2876696586608887, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 701.0, + "completions/max_terminated_length": 701.0, + "completions/mean_length": 457.984375, + "completions/mean_terminated_length": 457.984375, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 0.6851963746223565, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05580431967973709, + "learning_rate": 9.70332447791288e-07, + "loss": -0.0196, + "num_tokens": 192126012.0, + "reward": 4.536935806274414, + "reward_std": 2.658531665802002, + "rewards/accuracy_reward/mean": 3.786936044692993, + "rewards/accuracy_reward/std": 3.7665488719940186, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1430.0, + "completions/max_terminated_length": 1430.0, + "completions/mean_length": 627.9375, + "completions/mean_terminated_length": 627.9375, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 0.6858006042296072, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0637989342212677, + "learning_rate": 9.68002738790556e-07, + "loss": 0.0156, + "num_tokens": 192308456.0, + "reward": 5.773385047912598, + "reward_std": 2.8188700675964355, + "rewards/accuracy_reward/mean": 5.023385047912598, + "rewards/accuracy_reward/std": 3.500016927719116, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1075.0, + "completions/max_terminated_length": 1075.0, + "completions/mean_length": 656.90625, + "completions/mean_terminated_length": 656.90625, + "completions/min_length": 412.0, + "completions/min_terminated_length": 412.0, + "epoch": 0.686404833836858, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05504593998193741, + "learning_rate": 9.65675753601945e-07, + "loss": 0.0089, + "num_tokens": 192489474.0, + "reward": 5.832942485809326, + "reward_std": 2.685791492462158, + "rewards/accuracy_reward/mean": 5.082942008972168, + "rewards/accuracy_reward/std": 3.4314992427825928, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 862.0, + "completions/max_terminated_length": 862.0, + "completions/mean_length": 539.703125, + "completions/mean_terminated_length": 539.703125, + "completions/min_length": 328.0, + "completions/min_terminated_length": 328.0, + "epoch": 0.6870090634441087, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04419379681348801, + "learning_rate": 9.633515015191428e-07, + "loss": 0.0669, + "num_tokens": 192623343.0, + "reward": 3.751434326171875, + "reward_std": 2.40834379196167, + "rewards/accuracy_reward/mean": 3.001434326171875, + "rewards/accuracy_reward/std": 3.7126569747924805, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 841.0, + "completions/max_terminated_length": 841.0, + "completions/mean_length": 535.046875, + "completions/mean_terminated_length": 535.046875, + "completions/min_length": 348.0, + "completions/min_terminated_length": 348.0, + "epoch": 0.6876132930513595, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.02203783206641674, + "learning_rate": 9.61029991824923e-07, + "loss": 0.0029, + "num_tokens": 192766178.0, + "reward": 4.343400955200195, + "reward_std": 0.6502161026000977, + "rewards/accuracy_reward/mean": 3.5934014320373535, + "rewards/accuracy_reward/std": 3.6317567825317383, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1736.0, + "completions/max_terminated_length": 1736.0, + "completions/mean_length": 661.71875, + "completions/mean_terminated_length": 661.71875, + "completions/min_length": 327.0, + "completions/min_terminated_length": 327.0, + "epoch": 0.6882175226586102, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03189058601856232, + "learning_rate": 9.587112337911068e-07, + "loss": -0.0006, + "num_tokens": 192945056.0, + "reward": 3.221832752227783, + "reward_std": 0.9469137191772461, + "rewards/accuracy_reward/mean": 2.471832752227783, + "rewards/accuracy_reward/std": 3.5134122371673584, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 843.0, + "completions/max_terminated_length": 843.0, + "completions/mean_length": 564.078125, + "completions/mean_terminated_length": 564.078125, + "completions/min_length": 382.0, + "completions/min_terminated_length": 382.0, + "epoch": 0.6888217522658611, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.044879667460918427, + "learning_rate": 9.563952366785246e-07, + "loss": -0.0066, + "num_tokens": 193134229.0, + "reward": 3.060112476348877, + "reward_std": 1.969469666481018, + "rewards/accuracy_reward/mean": 2.310112476348877, + "rewards/accuracy_reward/std": 3.497072219848633, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 693.0, + "completions/max_terminated_length": 693.0, + "completions/mean_length": 475.765625, + "completions/mean_terminated_length": 475.765625, + "completions/min_length": 271.0, + "completions/min_terminated_length": 271.0, + "epoch": 0.6894259818731118, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.023931631818413734, + "learning_rate": 9.540820097369798e-07, + "loss": 0.0045, + "num_tokens": 193302070.0, + "reward": 2.617926597595215, + "reward_std": 0.9578295946121216, + "rewards/accuracy_reward/mean": 1.8679265975952148, + "rewards/accuracy_reward/std": 3.2437257766723633, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1089.0, + "completions/max_terminated_length": 1089.0, + "completions/mean_length": 608.9375, + "completions/mean_terminated_length": 608.9375, + "completions/min_length": 357.0, + "completions/min_terminated_length": 357.0, + "epoch": 0.6900302114803626, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.057216063141822815, + "learning_rate": 9.51771562205214e-07, + "loss": 0.0037, + "num_tokens": 193453330.0, + "reward": 2.66046404838562, + "reward_std": 2.4173495769500732, + "rewards/accuracy_reward/mean": 1.9104640483856201, + "rewards/accuracy_reward/std": 3.1827633380889893, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1335.0, + "completions/max_terminated_length": 1335.0, + "completions/mean_length": 648.9375, + "completions/mean_terminated_length": 648.9375, + "completions/min_length": 340.0, + "completions/min_terminated_length": 340.0, + "epoch": 0.6906344410876133, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.004310499876737595, + "learning_rate": 9.494639033108658e-07, + "loss": 0.0029, + "num_tokens": 193687854.0, + "reward": 0.8142765760421753, + "reward_std": 0.09845054149627686, + "rewards/accuracy_reward/mean": 0.0642765611410141, + "rewards/accuracy_reward/std": 0.2225196808576584, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 797.0, + "completions/max_terminated_length": 797.0, + "completions/mean_length": 502.90625, + "completions/mean_terminated_length": 502.90625, + "completions/min_length": 302.0, + "completions/min_terminated_length": 302.0, + "epoch": 0.691238670694864, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04320104047656059, + "learning_rate": 9.471590422704374e-07, + "loss": 0.0041, + "num_tokens": 193857896.0, + "reward": 5.548962593078613, + "reward_std": 1.7288298606872559, + "rewards/accuracy_reward/mean": 4.802868843078613, + "rewards/accuracy_reward/std": 3.585681200027466, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 1144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 914.0, + "completions/max_terminated_length": 914.0, + "completions/mean_length": 553.796875, + "completions/mean_terminated_length": 553.796875, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "epoch": 0.6918429003021148, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.029460793361067772, + "learning_rate": 9.448569882892578e-07, + "loss": 0.0149, + "num_tokens": 193987835.0, + "reward": 5.718440532684326, + "reward_std": 0.9857555627822876, + "rewards/accuracy_reward/mean": 4.968441009521484, + "rewards/accuracy_reward/std": 3.3745031356811523, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 911.0, + "completions/max_terminated_length": 911.0, + "completions/mean_length": 530.46875, + "completions/mean_terminated_length": 530.46875, + "completions/min_length": 299.0, + "completions/min_terminated_length": 299.0, + "epoch": 0.6924471299093655, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03758201748132706, + "learning_rate": 9.425577505614431e-07, + "loss": -0.0004, + "num_tokens": 194154873.0, + "reward": 5.4653215408325195, + "reward_std": 2.0653152465820312, + "rewards/accuracy_reward/mean": 4.715322017669678, + "rewards/accuracy_reward/std": 3.4856748580932617, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1620.0, + "completions/max_terminated_length": 1620.0, + "completions/mean_length": 596.515625, + "completions/mean_terminated_length": 596.515625, + "completions/min_length": 301.0, + "completions/min_terminated_length": 301.0, + "epoch": 0.6930513595166163, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.020924773067235947, + "learning_rate": 9.402613382698619e-07, + "loss": -0.0053, + "num_tokens": 194319194.0, + "reward": 2.57053279876709, + "reward_std": 1.0126829147338867, + "rewards/accuracy_reward/mean": 1.8205327987670898, + "rewards/accuracy_reward/std": 3.2556161880493164, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1179.0, + "completions/max_terminated_length": 1179.0, + "completions/mean_length": 582.953125, + "completions/mean_terminated_length": 582.953125, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "epoch": 0.693655589123867, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0010946637485176325, + "learning_rate": 9.379677605860996e-07, + "loss": 0.0001, + "num_tokens": 194455703.0, + "reward": 6.380520343780518, + "reward_std": 0.05315232649445534, + "rewards/accuracy_reward/mean": 5.630520343780518, + "rewards/accuracy_reward/std": 3.049623727798462, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1072.0, + "completions/max_terminated_length": 1072.0, + "completions/mean_length": 695.796875, + "completions/mean_terminated_length": 695.796875, + "completions/min_length": 309.0, + "completions/min_terminated_length": 309.0, + "epoch": 0.6942598187311179, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.024596113711595535, + "learning_rate": 9.3567702667042e-07, + "loss": 0.0103, + "num_tokens": 194606666.0, + "reward": 3.0247015953063965, + "reward_std": 1.1454187631607056, + "rewards/accuracy_reward/mean": 2.2747015953063965, + "rewards/accuracy_reward/std": 3.3846242427825928, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 798.0, + "completions/max_terminated_length": 798.0, + "completions/mean_length": 581.734375, + "completions/mean_terminated_length": 581.734375, + "completions/min_length": 390.0, + "completions/min_terminated_length": 390.0, + "epoch": 0.6948640483383686, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05208968743681908, + "learning_rate": 9.333891456717289e-07, + "loss": -0.0188, + "num_tokens": 194765081.0, + "reward": 5.068273544311523, + "reward_std": 2.676107883453369, + "rewards/accuracy_reward/mean": 4.318273544311523, + "rewards/accuracy_reward/std": 3.660050630569458, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1110.0, + "completions/max_terminated_length": 1110.0, + "completions/mean_length": 644.484375, + "completions/mean_terminated_length": 644.484375, + "completions/min_length": 340.0, + "completions/min_terminated_length": 340.0, + "epoch": 0.6954682779456194, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.02215077169239521, + "learning_rate": 9.311041267275375e-07, + "loss": -0.0005, + "num_tokens": 194900760.0, + "reward": 1.0853703022003174, + "reward_std": 0.5979238152503967, + "rewards/accuracy_reward/mean": 0.3353703022003174, + "rewards/accuracy_reward/std": 1.3056529760360718, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 991.0, + "completions/max_terminated_length": 991.0, + "completions/mean_length": 578.46875, + "completions/mean_terminated_length": 578.46875, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "epoch": 0.6960725075528701, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.030131064355373383, + "learning_rate": 9.288219789639276e-07, + "loss": 0.0258, + "num_tokens": 195132982.0, + "reward": 3.5250000953674316, + "reward_std": 1.0458506345748901, + "rewards/accuracy_reward/mean": 2.7750000953674316, + "rewards/accuracy_reward/std": 3.6962642669677734, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 999.0, + "completions/max_terminated_length": 999.0, + "completions/mean_length": 606.609375, + "completions/mean_terminated_length": 606.609375, + "completions/min_length": 317.0, + "completions/min_terminated_length": 317.0, + "epoch": 0.6966767371601208, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04084126278758049, + "learning_rate": 9.26542711495513e-07, + "loss": 0.0051, + "num_tokens": 195304317.0, + "reward": 3.758723258972168, + "reward_std": 2.099834680557251, + "rewards/accuracy_reward/mean": 3.008723258972168, + "rewards/accuracy_reward/std": 3.700418710708618, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 900.0, + "completions/max_terminated_length": 900.0, + "completions/mean_length": 532.34375, + "completions/mean_terminated_length": 532.34375, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, + "epoch": 0.6972809667673716, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.029624072834849358, + "learning_rate": 9.242663334254032e-07, + "loss": -0.0053, + "num_tokens": 195442659.0, + "reward": 3.9896414279937744, + "reward_std": 1.2762036323547363, + "rewards/accuracy_reward/mean": 3.2396414279937744, + "rewards/accuracy_reward/std": 3.7363574504852295, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1107.0, + "completions/mean_length": 695.625, + "completions/mean_terminated_length": 629.11474609375, + "completions/min_length": 326.0, + "completions/min_terminated_length": 326.0, + "epoch": 0.6978851963746223, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.005249666515737772, + "learning_rate": 9.219928538451701e-07, + "loss": -0.027, + "num_tokens": 195557211.0, + "reward": 2.547220230102539, + "reward_std": 0.31758129596710205, + "rewards/accuracy_reward/mean": 1.824563980102539, + "rewards/accuracy_reward/std": 3.272566080093384, + "rewards/tag_count_reward/mean": 0.72265625, + "rewards/tag_count_reward/std": 0.13449780642986298, + "step": 1155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 837.0, + "completions/max_terminated_length": 837.0, + "completions/mean_length": 542.15625, + "completions/mean_terminated_length": 542.15625, + "completions/min_length": 329.0, + "completions/min_terminated_length": 329.0, + "epoch": 0.6984894259818731, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03869938105344772, + "learning_rate": 9.197222818348071e-07, + "loss": 0.0174, + "num_tokens": 195714405.0, + "reward": 5.803211212158203, + "reward_std": 2.052837371826172, + "rewards/accuracy_reward/mean": 5.053211212158203, + "rewards/accuracy_reward/std": 3.5416312217712402, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1066.0, + "completions/mean_length": 690.296875, + "completions/mean_terminated_length": 668.74609375, + "completions/min_length": 407.0, + "completions/min_terminated_length": 407.0, + "epoch": 0.6990936555891238, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0404365137219429, + "learning_rate": 9.174546264626964e-07, + "loss": -0.0229, + "num_tokens": 195885368.0, + "reward": 3.245267152786255, + "reward_std": 2.2436439990997314, + "rewards/accuracy_reward/mean": 2.506986141204834, + "rewards/accuracy_reward/std": 3.4850683212280273, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 1157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 896.0, + "completions/max_terminated_length": 896.0, + "completions/mean_length": 619.09375, + "completions/mean_terminated_length": 619.09375, + "completions/min_length": 343.0, + "completions/min_terminated_length": 343.0, + "epoch": 0.6996978851963747, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03902258351445198, + "learning_rate": 9.1518989678557e-07, + "loss": 0.0008, + "num_tokens": 196028126.0, + "reward": 5.204816818237305, + "reward_std": 1.711134910583496, + "rewards/accuracy_reward/mean": 4.454816818237305, + "rewards/accuracy_reward/std": 3.6422817707061768, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1345.0, + "completions/max_terminated_length": 1345.0, + "completions/mean_length": 714.578125, + "completions/mean_terminated_length": 714.578125, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "epoch": 0.7003021148036254, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03268194943666458, + "learning_rate": 9.129281018484779e-07, + "loss": 0.0014, + "num_tokens": 196209891.0, + "reward": 6.273694038391113, + "reward_std": 1.019364595413208, + "rewards/accuracy_reward/mean": 5.523694038391113, + "rewards/accuracy_reward/std": 3.329540491104126, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1278.0, + "completions/mean_length": 647.296875, + "completions/mean_terminated_length": 625.0635375976562, + "completions/min_length": 366.0, + "completions/min_terminated_length": 366.0, + "epoch": 0.7009063444108762, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.024683784693479538, + "learning_rate": 9.106692506847469e-07, + "loss": -0.0184, + "num_tokens": 196493654.0, + "reward": 4.116968631744385, + "reward_std": 0.9056222438812256, + "rewards/accuracy_reward/mean": 3.3904061317443848, + "rewards/accuracy_reward/std": 3.8375935554504395, + "rewards/tag_count_reward/mean": 0.7265625, + "rewards/tag_count_reward/std": 0.13152606785297394, + "step": 1160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1412.0, + "completions/max_terminated_length": 1412.0, + "completions/mean_length": 619.15625, + "completions/mean_terminated_length": 619.15625, + "completions/min_length": 408.0, + "completions/min_terminated_length": 408.0, + "epoch": 0.7015105740181269, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.05926866829395294, + "learning_rate": 9.084133523159459e-07, + "loss": -0.0057, + "num_tokens": 196718144.0, + "reward": 2.816275119781494, + "reward_std": 1.8529748916625977, + "rewards/accuracy_reward/mean": 2.066275119781494, + "rewards/accuracy_reward/std": 3.4038403034210205, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1491.0, + "completions/max_terminated_length": 1491.0, + "completions/mean_length": 596.078125, + "completions/mean_terminated_length": 596.078125, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "epoch": 0.7021148036253776, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04984031617641449, + "learning_rate": 9.061604157518531e-07, + "loss": 0.0304, + "num_tokens": 196860661.0, + "reward": 4.069007873535156, + "reward_std": 3.226038694381714, + "rewards/accuracy_reward/mean": 3.3190078735351562, + "rewards/accuracy_reward/std": 3.6671602725982666, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 809.0, + "completions/max_terminated_length": 809.0, + "completions/mean_length": 514.734375, + "completions/mean_terminated_length": 514.734375, + "completions/min_length": 334.0, + "completions/min_terminated_length": 334.0, + "epoch": 0.7027190332326284, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04290672764182091, + "learning_rate": 9.03910449990417e-07, + "loss": -0.012, + "num_tokens": 196998452.0, + "reward": 7.018614292144775, + "reward_std": 2.449401378631592, + "rewards/accuracy_reward/mean": 6.268613815307617, + "rewards/accuracy_reward/std": 2.789496421813965, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 851.0, + "completions/max_terminated_length": 851.0, + "completions/mean_length": 575.1875, + "completions/mean_terminated_length": 575.1875, + "completions/min_length": 397.0, + "completions/min_terminated_length": 397.0, + "epoch": 0.7033232628398791, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05068341642618179, + "learning_rate": 9.016634640177203e-07, + "loss": 0.0051, + "num_tokens": 197192928.0, + "reward": 6.67257022857666, + "reward_std": 2.2589924335479736, + "rewards/accuracy_reward/mean": 5.92257022857666, + "rewards/accuracy_reward/std": 3.0140464305877686, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1015.0, + "completions/max_terminated_length": 1015.0, + "completions/mean_length": 586.171875, + "completions/mean_terminated_length": 586.171875, + "completions/min_length": 337.0, + "completions/min_terminated_length": 337.0, + "epoch": 0.7039274924471299, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.036169592291116714, + "learning_rate": 8.99419466807944e-07, + "loss": 0.0108, + "num_tokens": 197369147.0, + "reward": 4.944447040557861, + "reward_std": 1.4034453630447388, + "rewards/accuracy_reward/mean": 4.194447040557861, + "rewards/accuracy_reward/std": 3.6926651000976562, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 863.0, + "completions/max_terminated_length": 863.0, + "completions/mean_length": 600.640625, + "completions/mean_terminated_length": 600.640625, + "completions/min_length": 378.0, + "completions/min_terminated_length": 378.0, + "epoch": 0.7045317220543806, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.051359813660383224, + "learning_rate": 8.971784673233349e-07, + "loss": 0.0043, + "num_tokens": 197545716.0, + "reward": 4.341195106506348, + "reward_std": 2.3828725814819336, + "rewards/accuracy_reward/mean": 3.5911951065063477, + "rewards/accuracy_reward/std": 3.9964077472686768, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 913.0, + "completions/max_terminated_length": 913.0, + "completions/mean_length": 521.1875, + "completions/mean_terminated_length": 521.1875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "epoch": 0.7051359516616315, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.025830749422311783, + "learning_rate": 8.949404745141655e-07, + "loss": 0.0002, + "num_tokens": 197730896.0, + "reward": 6.293841361999512, + "reward_std": 0.9250451326370239, + "rewards/accuracy_reward/mean": 5.543841361999512, + "rewards/accuracy_reward/std": 3.2117748260498047, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 727.0, + "completions/max_terminated_length": 727.0, + "completions/mean_length": 470.25, + "completions/mean_terminated_length": 470.25, + "completions/min_length": 280.0, + "completions/min_terminated_length": 280.0, + "epoch": 0.7057401812688822, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.02491951547563076, + "learning_rate": 8.927054973186995e-07, + "loss": -0.011, + "num_tokens": 197880304.0, + "reward": 4.536284446716309, + "reward_std": 0.49704509973526, + "rewards/accuracy_reward/mean": 3.7862842082977295, + "rewards/accuracy_reward/std": 3.7658350467681885, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 908.0, + "completions/max_terminated_length": 908.0, + "completions/mean_length": 640.234375, + "completions/mean_terminated_length": 640.234375, + "completions/min_length": 361.0, + "completions/min_terminated_length": 361.0, + "epoch": 0.706344410876133, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0561741441488266, + "learning_rate": 8.904735446631587e-07, + "loss": 0.0028, + "num_tokens": 198050383.0, + "reward": 3.8358092308044434, + "reward_std": 2.580744981765747, + "rewards/accuracy_reward/mean": 3.0858094692230225, + "rewards/accuracy_reward/std": 3.7306549549102783, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1277.0, + "completions/max_terminated_length": 1277.0, + "completions/mean_length": 577.96875, + "completions/mean_terminated_length": 577.96875, + "completions/min_length": 285.0, + "completions/min_terminated_length": 285.0, + "epoch": 0.7069486404833837, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.01614748314023018, + "learning_rate": 8.882446254616833e-07, + "loss": 0.0022, + "num_tokens": 198195565.0, + "reward": 4.436890602111816, + "reward_std": 0.4721169173717499, + "rewards/accuracy_reward/mean": 3.6868906021118164, + "rewards/accuracy_reward/std": 3.607131242752075, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 790.0, + "completions/mean_length": 537.0625, + "completions/mean_terminated_length": 513.0794067382812, + "completions/min_length": 339.0, + "completions/min_terminated_length": 339.0, + "epoch": 0.7075528700906344, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.054519977420568466, + "learning_rate": 8.860187486162985e-07, + "loss": -0.0517, + "num_tokens": 198357009.0, + "reward": 5.939263343811035, + "reward_std": 2.311136245727539, + "rewards/accuracy_reward/mean": 5.204888820648193, + "rewards/accuracy_reward/std": 3.4444010257720947, + "rewards/tag_count_reward/mean": 0.734375, + "rewards/tag_count_reward/std": 0.09834947437047958, + "step": 1171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 944.0, + "completions/max_terminated_length": 944.0, + "completions/mean_length": 546.921875, + "completions/mean_terminated_length": 546.921875, + "completions/min_length": 310.0, + "completions/min_terminated_length": 310.0, + "epoch": 0.7081570996978852, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03258447349071503, + "learning_rate": 8.837959230168804e-07, + "loss": -0.0199, + "num_tokens": 198513804.0, + "reward": 2.927473306655884, + "reward_std": 1.2788455486297607, + "rewards/accuracy_reward/mean": 2.181379556655884, + "rewards/accuracy_reward/std": 3.3340706825256348, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 1172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1055.0, + "completions/max_terminated_length": 1055.0, + "completions/mean_length": 521.578125, + "completions/mean_terminated_length": 521.578125, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.7087613293051359, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.039874762296676636, + "learning_rate": 8.81576157541117e-07, + "loss": 0.0248, + "num_tokens": 198651345.0, + "reward": 5.152478218078613, + "reward_std": 1.8072441816329956, + "rewards/accuracy_reward/mean": 4.402478218078613, + "rewards/accuracy_reward/std": 3.776153087615967, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 844.0, + "completions/max_terminated_length": 844.0, + "completions/mean_length": 542.390625, + "completions/mean_terminated_length": 542.390625, + "completions/min_length": 317.0, + "completions/min_terminated_length": 317.0, + "epoch": 0.7093655589123867, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03276903182268143, + "learning_rate": 8.793594610544745e-07, + "loss": 0.0065, + "num_tokens": 198846762.0, + "reward": 4.23658561706543, + "reward_std": 1.3752011060714722, + "rewards/accuracy_reward/mean": 3.486585855484009, + "rewards/accuracy_reward/std": 3.9595255851745605, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 899.0, + "completions/max_terminated_length": 899.0, + "completions/mean_length": 643.46875, + "completions/mean_terminated_length": 643.46875, + "completions/min_length": 442.0, + "completions/min_terminated_length": 442.0, + "epoch": 0.7099697885196374, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03486613929271698, + "learning_rate": 8.771458424101633e-07, + "loss": 0.0021, + "num_tokens": 199062776.0, + "reward": 3.3300342559814453, + "reward_std": 1.0565526485443115, + "rewards/accuracy_reward/mean": 2.5800344944000244, + "rewards/accuracy_reward/std": 3.568333625793457, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1084.0, + "completions/max_terminated_length": 1084.0, + "completions/mean_length": 558.875, + "completions/mean_terminated_length": 558.875, + "completions/min_length": 349.0, + "completions/min_terminated_length": 349.0, + "epoch": 0.7105740181268883, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0018144305795431137, + "learning_rate": 8.74935310449101e-07, + "loss": -0.0003, + "num_tokens": 199232768.0, + "reward": 2.661196708679199, + "reward_std": 0.06268125772476196, + "rewards/accuracy_reward/mean": 1.9111968278884888, + "rewards/accuracy_reward/std": 3.230579376220703, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 732.0, + "completions/max_terminated_length": 732.0, + "completions/mean_length": 500.546875, + "completions/mean_terminated_length": 500.546875, + "completions/min_length": 323.0, + "completions/min_terminated_length": 323.0, + "epoch": 0.711178247734139, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.025339441373944283, + "learning_rate": 8.727278739998765e-07, + "loss": 0.0051, + "num_tokens": 199391107.0, + "reward": 6.321578025817871, + "reward_std": 1.0142998695373535, + "rewards/accuracy_reward/mean": 5.571578025817871, + "rewards/accuracy_reward/std": 3.2809817790985107, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1014.0, + "completions/max_terminated_length": 1014.0, + "completions/mean_length": 544.34375, + "completions/mean_terminated_length": 544.34375, + "completions/min_length": 329.0, + "completions/min_terminated_length": 329.0, + "epoch": 0.7117824773413898, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.051296815276145935, + "learning_rate": 8.705235418787152e-07, + "loss": 0.0443, + "num_tokens": 199555513.0, + "reward": 5.258817195892334, + "reward_std": 2.192005157470703, + "rewards/accuracy_reward/mean": 4.508817195892334, + "rewards/accuracy_reward/std": 3.6603760719299316, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 924.0, + "completions/mean_length": 532.203125, + "completions/mean_terminated_length": 508.14288330078125, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 0.7123867069486405, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.035216886550188065, + "learning_rate": 8.683223228894465e-07, + "loss": -0.0331, + "num_tokens": 199729174.0, + "reward": 3.032101631164551, + "reward_std": 0.8785077333450317, + "rewards/accuracy_reward/mean": 2.286007881164551, + "rewards/accuracy_reward/std": 3.454341173171997, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 1179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 718.0, + "completions/max_terminated_length": 718.0, + "completions/mean_length": 538.296875, + "completions/mean_terminated_length": 538.296875, + "completions/min_length": 355.0, + "completions/min_terminated_length": 355.0, + "epoch": 0.7129909365558912, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04013289883732796, + "learning_rate": 8.661242258234642e-07, + "loss": 0.0007, + "num_tokens": 199890137.0, + "reward": 7.168487548828125, + "reward_std": 1.8745368719100952, + "rewards/accuracy_reward/mean": 6.418487548828125, + "rewards/accuracy_reward/std": 2.5771822929382324, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 843.0, + "completions/max_terminated_length": 843.0, + "completions/mean_length": 569.828125, + "completions/mean_terminated_length": 569.828125, + "completions/min_length": 274.0, + "completions/min_terminated_length": 274.0, + "epoch": 0.713595166163142, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03726939111948013, + "learning_rate": 8.639292594596936e-07, + "loss": 0.0235, + "num_tokens": 200066734.0, + "reward": 3.7852468490600586, + "reward_std": 1.5148141384124756, + "rewards/accuracy_reward/mean": 3.0352468490600586, + "rewards/accuracy_reward/std": 3.569028377532959, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 812.0, + "completions/max_terminated_length": 812.0, + "completions/mean_length": 546.171875, + "completions/mean_terminated_length": 546.171875, + "completions/min_length": 288.0, + "completions/min_terminated_length": 288.0, + "epoch": 0.7141993957703927, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06431790441274643, + "learning_rate": 8.617374325645582e-07, + "loss": 0.0215, + "num_tokens": 200224025.0, + "reward": 4.198570251464844, + "reward_std": 3.0704593658447266, + "rewards/accuracy_reward/mean": 3.4485702514648438, + "rewards/accuracy_reward/std": 3.734119415283203, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 758.0, + "completions/max_terminated_length": 758.0, + "completions/mean_length": 487.40625, + "completions/mean_terminated_length": 487.40625, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 0.7148036253776435, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03605975583195686, + "learning_rate": 8.595487538919409e-07, + "loss": -0.0017, + "num_tokens": 200380547.0, + "reward": 4.106305122375488, + "reward_std": 1.5087954998016357, + "rewards/accuracy_reward/mean": 3.35630464553833, + "rewards/accuracy_reward/std": 3.7055776119232178, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1158.0, + "completions/max_terminated_length": 1158.0, + "completions/mean_length": 619.265625, + "completions/mean_terminated_length": 619.265625, + "completions/min_length": 329.0, + "completions/min_terminated_length": 329.0, + "epoch": 0.7154078549848942, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.02493307553231716, + "learning_rate": 8.573632321831514e-07, + "loss": 0.0108, + "num_tokens": 200515332.0, + "reward": 4.150230884552002, + "reward_std": 1.1523311138153076, + "rewards/accuracy_reward/mean": 3.400230884552002, + "rewards/accuracy_reward/std": 3.707750082015991, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 819.0, + "completions/max_terminated_length": 819.0, + "completions/mean_length": 527.03125, + "completions/mean_terminated_length": 527.03125, + "completions/min_length": 305.0, + "completions/min_terminated_length": 305.0, + "epoch": 0.716012084592145, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.04485708475112915, + "learning_rate": 8.551808761668921e-07, + "loss": 0.0056, + "num_tokens": 200757606.0, + "reward": 3.0831189155578613, + "reward_std": 1.8397166728973389, + "rewards/accuracy_reward/mean": 2.3331189155578613, + "rewards/accuracy_reward/std": 3.460862398147583, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 954.0, + "completions/max_terminated_length": 954.0, + "completions/mean_length": 601.609375, + "completions/mean_terminated_length": 601.609375, + "completions/min_length": 396.0, + "completions/min_terminated_length": 396.0, + "epoch": 0.7166163141993958, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0390147864818573, + "learning_rate": 8.530016945592208e-07, + "loss": -0.0041, + "num_tokens": 200916973.0, + "reward": 5.057071685791016, + "reward_std": 1.524283528327942, + "rewards/accuracy_reward/mean": 4.307071685791016, + "rewards/accuracy_reward/std": 3.8260388374328613, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 856.0, + "completions/max_terminated_length": 856.0, + "completions/mean_length": 542.125, + "completions/mean_terminated_length": 542.125, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "epoch": 0.7172205438066466, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05271654948592186, + "learning_rate": 8.508256960635172e-07, + "loss": 0.0012, + "num_tokens": 201063781.0, + "reward": 5.207575798034668, + "reward_std": 2.616055965423584, + "rewards/accuracy_reward/mean": 4.457575798034668, + "rewards/accuracy_reward/std": 3.606743812561035, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 971.0, + "completions/max_terminated_length": 971.0, + "completions/mean_length": 580.5625, + "completions/mean_terminated_length": 580.5625, + "completions/min_length": 322.0, + "completions/min_terminated_length": 322.0, + "epoch": 0.7178247734138973, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0528738833963871, + "learning_rate": 8.486528893704481e-07, + "loss": -0.0262, + "num_tokens": 201280537.0, + "reward": 2.4756920337677, + "reward_std": 1.9444005489349365, + "rewards/accuracy_reward/mean": 1.7256921529769897, + "rewards/accuracy_reward/std": 3.1833412647247314, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1108.0, + "completions/max_terminated_length": 1108.0, + "completions/mean_length": 561.21875, + "completions/mean_terminated_length": 561.21875, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "epoch": 0.718429003021148, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05030902475118637, + "learning_rate": 8.464832831579328e-07, + "loss": -0.0007, + "num_tokens": 201432743.0, + "reward": 4.296179294586182, + "reward_std": 2.197256088256836, + "rewards/accuracy_reward/mean": 3.54617977142334, + "rewards/accuracy_reward/std": 3.6862447261810303, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 962.0, + "completions/mean_length": 586.1875, + "completions/mean_terminated_length": 562.984130859375, + "completions/min_length": 356.0, + "completions/min_terminated_length": 356.0, + "epoch": 0.7190332326283988, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0526767298579216, + "learning_rate": 8.443168860911092e-07, + "loss": -0.0258, + "num_tokens": 201573187.0, + "reward": 4.567015647888184, + "reward_std": 2.9928903579711914, + "rewards/accuracy_reward/mean": 3.8287343978881836, + "rewards/accuracy_reward/std": 3.6619176864624023, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 1190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 865.0, + "completions/max_terminated_length": 865.0, + "completions/mean_length": 505.453125, + "completions/mean_terminated_length": 505.453125, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, + "epoch": 0.7196374622356495, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.033645376563072205, + "learning_rate": 8.421537068222967e-07, + "loss": 0.0217, + "num_tokens": 201705904.0, + "reward": 5.699625015258789, + "reward_std": 0.9737978577613831, + "rewards/accuracy_reward/mean": 4.949625015258789, + "rewards/accuracy_reward/std": 3.3698980808258057, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 747.0, + "completions/max_terminated_length": 747.0, + "completions/mean_length": 514.75, + "completions/mean_terminated_length": 514.75, + "completions/min_length": 353.0, + "completions/min_terminated_length": 353.0, + "epoch": 0.7202416918429003, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0022643052507191896, + "learning_rate": 8.399937539909634e-07, + "loss": -0.0, + "num_tokens": 201831872.0, + "reward": 2.6065516471862793, + "reward_std": 0.09701729565858841, + "rewards/accuracy_reward/mean": 1.8565516471862793, + "rewards/accuracy_reward/std": 3.2640960216522217, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 845.0, + "completions/max_terminated_length": 845.0, + "completions/mean_length": 511.46875, + "completions/mean_terminated_length": 511.46875, + "completions/min_length": 269.0, + "completions/min_terminated_length": 269.0, + "epoch": 0.720845921450151, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03715214505791664, + "learning_rate": 8.378370362236931e-07, + "loss": -0.0023, + "num_tokens": 201978494.0, + "reward": 5.648435592651367, + "reward_std": 1.4320762157440186, + "rewards/accuracy_reward/mean": 4.898435592651367, + "rewards/accuracy_reward/std": 3.573341131210327, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 962.0, + "completions/max_terminated_length": 962.0, + "completions/mean_length": 647.640625, + "completions/mean_terminated_length": 647.640625, + "completions/min_length": 412.0, + "completions/min_terminated_length": 412.0, + "epoch": 0.7214501510574018, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.003457053564488888, + "learning_rate": 8.356835621341471e-07, + "loss": 0.0009, + "num_tokens": 202182295.0, + "reward": 2.5968751907348633, + "reward_std": 0.07032991945743561, + "rewards/accuracy_reward/mean": 1.8468749523162842, + "rewards/accuracy_reward/std": 3.2629573345184326, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 890.0, + "completions/mean_length": 616.921875, + "completions/mean_terminated_length": 594.2063598632812, + "completions/min_length": 456.0, + "completions/min_terminated_length": 456.0, + "epoch": 0.7220543806646526, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0379020981490612, + "learning_rate": 8.335333403230324e-07, + "loss": -0.0061, + "num_tokens": 202339906.0, + "reward": 3.615060806274414, + "reward_std": 1.6238770484924316, + "rewards/accuracy_reward/mean": 2.876779556274414, + "rewards/accuracy_reward/std": 3.6760432720184326, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 1195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 668.0, + "completions/max_terminated_length": 668.0, + "completions/mean_length": 494.46875, + "completions/mean_terminated_length": 494.46875, + "completions/min_length": 378.0, + "completions/min_terminated_length": 378.0, + "epoch": 0.7226586102719034, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.037894297391176224, + "learning_rate": 8.313863793780681e-07, + "loss": -0.0137, + "num_tokens": 202505216.0, + "reward": 5.1586809158325195, + "reward_std": 1.7467162609100342, + "rewards/accuracy_reward/mean": 4.4086809158325195, + "rewards/accuracy_reward/std": 3.7097578048706055, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 933.0, + "completions/max_terminated_length": 933.0, + "completions/mean_length": 626.78125, + "completions/mean_terminated_length": 626.78125, + "completions/min_length": 469.0, + "completions/min_terminated_length": 469.0, + "epoch": 0.7232628398791541, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07659261673688889, + "learning_rate": 8.292426878739483e-07, + "loss": -0.0069, + "num_tokens": 202690402.0, + "reward": 6.24092960357666, + "reward_std": 3.0600075721740723, + "rewards/accuracy_reward/mean": 5.49092960357666, + "rewards/accuracy_reward/std": 3.295745372772217, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 868.0, + "completions/max_terminated_length": 868.0, + "completions/mean_length": 586.6875, + "completions/mean_terminated_length": 586.6875, + "completions/min_length": 311.0, + "completions/min_terminated_length": 311.0, + "epoch": 0.7238670694864048, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.037531495094299316, + "learning_rate": 8.271022743723094e-07, + "loss": -0.0056, + "num_tokens": 202818894.0, + "reward": 4.007770538330078, + "reward_std": 1.8846731185913086, + "rewards/accuracy_reward/mean": 3.265582799911499, + "rewards/accuracy_reward/std": 3.726879119873047, + "rewards/tag_count_reward/mean": 0.7421875, + "rewards/tag_count_reward/std": 0.043842025101184845, + "step": 1198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1050.0, + "completions/max_terminated_length": 1050.0, + "completions/mean_length": 617.90625, + "completions/mean_terminated_length": 617.90625, + "completions/min_length": 403.0, + "completions/min_terminated_length": 403.0, + "epoch": 0.7244712990936556, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03781775385141373, + "learning_rate": 8.249651474216974e-07, + "loss": 0.0175, + "num_tokens": 202993816.0, + "reward": 5.55238151550293, + "reward_std": 1.0440149307250977, + "rewards/accuracy_reward/mean": 4.80238151550293, + "rewards/accuracy_reward/std": 3.578618288040161, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 948.0, + "completions/max_terminated_length": 948.0, + "completions/mean_length": 567.15625, + "completions/mean_terminated_length": 567.15625, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "epoch": 0.7250755287009063, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04829377308487892, + "learning_rate": 8.228313155575304e-07, + "loss": 0.0024, + "num_tokens": 203170786.0, + "reward": 4.564976692199707, + "reward_std": 2.5928707122802734, + "rewards/accuracy_reward/mean": 3.813023567199707, + "rewards/accuracy_reward/std": 3.7307629585266113, + "rewards/tag_count_reward/mean": 0.751953125, + "rewards/tag_count_reward/std": 0.015625, + "step": 1200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 749.0, + "completions/mean_length": 563.390625, + "completions/mean_terminated_length": 515.5, + "completions/min_length": 298.0, + "completions/min_terminated_length": 298.0, + "epoch": 0.7256797583081571, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04181830585002899, + "learning_rate": 8.207007873020669e-07, + "loss": -0.0439, + "num_tokens": 203317419.0, + "reward": 3.697357654571533, + "reward_std": 1.7803384065628052, + "rewards/accuracy_reward/mean": 2.970795154571533, + "rewards/accuracy_reward/std": 3.6194164752960205, + "rewards/tag_count_reward/mean": 0.7265625, + "rewards/tag_count_reward/std": 0.13152606785297394, + "step": 1201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 795.0, + "completions/mean_length": 547.515625, + "completions/mean_terminated_length": 523.6984252929688, + "completions/min_length": 331.0, + "completions/min_terminated_length": 331.0, + "epoch": 0.7262839879154078, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05887909233570099, + "learning_rate": 8.185735711643722e-07, + "loss": 0.0211, + "num_tokens": 203506028.0, + "reward": 5.221889972686768, + "reward_std": 3.205850601196289, + "rewards/accuracy_reward/mean": 4.483609199523926, + "rewards/accuracy_reward/std": 3.686880111694336, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 1202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1162.0, + "completions/max_terminated_length": 1162.0, + "completions/mean_length": 549.203125, + "completions/mean_terminated_length": 549.203125, + "completions/min_length": 312.0, + "completions/min_terminated_length": 312.0, + "epoch": 0.7268882175226586, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.046771228313446045, + "learning_rate": 8.164496756402818e-07, + "loss": 0.0913, + "num_tokens": 203655561.0, + "reward": 4.887584209442139, + "reward_std": 1.8399821519851685, + "rewards/accuracy_reward/mean": 4.137584209442139, + "rewards/accuracy_reward/std": 3.6780717372894287, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1533.0, + "completions/mean_length": 698.296875, + "completions/mean_terminated_length": 631.91796875, + "completions/min_length": 350.0, + "completions/min_terminated_length": 350.0, + "epoch": 0.7274924471299093, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.05467850714921951, + "learning_rate": 8.143291092123708e-07, + "loss": 0.0236, + "num_tokens": 203847708.0, + "reward": 1.7896265983581543, + "reward_std": 2.2604446411132812, + "rewards/accuracy_reward/mean": 1.0747828483581543, + "rewards/accuracy_reward/std": 2.747459650039673, + "rewards/tag_count_reward/mean": 0.71484375, + "rewards/tag_count_reward/std": 0.1597815304994583, + "step": 1204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1728.0, + "completions/max_terminated_length": 1728.0, + "completions/mean_length": 474.4375, + "completions/mean_terminated_length": 474.4375, + "completions/min_length": 273.0, + "completions/min_terminated_length": 273.0, + "epoch": 0.7280966767371602, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.041028376668691635, + "learning_rate": 8.122118803499163e-07, + "loss": -0.0019, + "num_tokens": 203998184.0, + "reward": 6.620664596557617, + "reward_std": 1.8917791843414307, + "rewards/accuracy_reward/mean": 5.870664596557617, + "rewards/accuracy_reward/std": 3.0304019451141357, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 939.0, + "completions/max_terminated_length": 939.0, + "completions/mean_length": 652.171875, + "completions/mean_terminated_length": 652.171875, + "completions/min_length": 377.0, + "completions/min_terminated_length": 377.0, + "epoch": 0.7287009063444109, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03129761666059494, + "learning_rate": 8.100979975088678e-07, + "loss": -0.0102, + "num_tokens": 204203379.0, + "reward": 2.7230045795440674, + "reward_std": 1.4283387660980225, + "rewards/accuracy_reward/mean": 1.973004698753357, + "rewards/accuracy_reward/std": 3.3277440071105957, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 810.0, + "completions/max_terminated_length": 810.0, + "completions/mean_length": 575.875, + "completions/mean_terminated_length": 575.875, + "completions/min_length": 427.0, + "completions/min_terminated_length": 427.0, + "epoch": 0.7293051359516616, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.02129638008773327, + "learning_rate": 8.079874691318097e-07, + "loss": -0.0029, + "num_tokens": 204372859.0, + "reward": 4.214322090148926, + "reward_std": 1.0176275968551636, + "rewards/accuracy_reward/mean": 3.4643218517303467, + "rewards/accuracy_reward/std": 3.7843170166015625, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1066.0, + "completions/mean_length": 599.421875, + "completions/mean_terminated_length": 476.6610107421875, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 0.7299093655589124, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.026648791506886482, + "learning_rate": 8.058803036479289e-07, + "loss": -0.0318, + "num_tokens": 204505286.0, + "reward": 4.734027862548828, + "reward_std": 1.0709583759307861, + "rewards/accuracy_reward/mean": 4.04262113571167, + "rewards/accuracy_reward/std": 3.76123046875, + "rewards/tag_count_reward/mean": 0.69140625, + "rewards/tag_count_reward/std": 0.2028672844171524, + "step": 1208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1972.0, + "completions/mean_length": 781.09375, + "completions/mean_terminated_length": 760.9841918945312, + "completions/min_length": 424.0, + "completions/min_terminated_length": 424.0, + "epoch": 0.7305135951661631, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.035085730254650116, + "learning_rate": 8.037765094729825e-07, + "loss": -0.0376, + "num_tokens": 204764940.0, + "reward": 7.433682918548584, + "reward_std": 0.9790116548538208, + "rewards/accuracy_reward/mean": 6.687588691711426, + "rewards/accuracy_reward/std": 2.2147858142852783, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 1209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 712.0, + "completions/max_terminated_length": 712.0, + "completions/mean_length": 489.296875, + "completions/mean_terminated_length": 489.296875, + "completions/min_length": 296.0, + "completions/min_terminated_length": 296.0, + "epoch": 0.7311178247734139, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.027722857892513275, + "learning_rate": 8.016760950092626e-07, + "loss": 0.0121, + "num_tokens": 204911519.0, + "reward": 4.427918910980225, + "reward_std": 1.6856052875518799, + "rewards/accuracy_reward/mean": 3.6818253993988037, + "rewards/accuracy_reward/std": 3.778015375137329, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 1210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1179.0, + "completions/max_terminated_length": 1179.0, + "completions/mean_length": 670.765625, + "completions/mean_terminated_length": 670.765625, + "completions/min_length": 364.0, + "completions/min_terminated_length": 364.0, + "epoch": 0.7317220543806646, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.019329771399497986, + "learning_rate": 7.995790686455621e-07, + "loss": 0.0095, + "num_tokens": 205093728.0, + "reward": 6.082988739013672, + "reward_std": 0.7359715700149536, + "rewards/accuracy_reward/mean": 5.33298921585083, + "rewards/accuracy_reward/std": 3.436636447906494, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 908.0, + "completions/max_terminated_length": 908.0, + "completions/mean_length": 527.53125, + "completions/mean_terminated_length": 527.53125, + "completions/min_length": 309.0, + "completions/min_terminated_length": 309.0, + "epoch": 0.7323262839879154, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.044785793870687485, + "learning_rate": 7.97485438757144e-07, + "loss": 0.0049, + "num_tokens": 205253906.0, + "reward": 4.939362525939941, + "reward_std": 2.158292770385742, + "rewards/accuracy_reward/mean": 4.189362525939941, + "rewards/accuracy_reward/std": 3.7608368396759033, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 888.0, + "completions/max_terminated_length": 888.0, + "completions/mean_length": 522.078125, + "completions/mean_terminated_length": 522.078125, + "completions/min_length": 354.0, + "completions/min_terminated_length": 354.0, + "epoch": 0.7329305135951661, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.041546184569597244, + "learning_rate": 7.953952137057048e-07, + "loss": -0.0028, + "num_tokens": 205398967.0, + "reward": 5.018184185028076, + "reward_std": 1.7460978031158447, + "rewards/accuracy_reward/mean": 4.268184185028076, + "rewards/accuracy_reward/std": 3.74271559715271, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 760.0, + "completions/mean_length": 566.515625, + "completions/mean_terminated_length": 543.0000610351562, + "completions/min_length": 404.0, + "completions/min_terminated_length": 404.0, + "epoch": 0.733534743202417, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.036871425807476044, + "learning_rate": 7.933084018393434e-07, + "loss": -0.023, + "num_tokens": 205530088.0, + "reward": 4.068576812744141, + "reward_std": 1.7613446712493896, + "rewards/accuracy_reward/mean": 3.3302953243255615, + "rewards/accuracy_reward/std": 3.754958152770996, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 1214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 643.0, + "completions/max_terminated_length": 643.0, + "completions/mean_length": 488.234375, + "completions/mean_terminated_length": 488.234375, + "completions/min_length": 343.0, + "completions/min_terminated_length": 343.0, + "epoch": 0.7341389728096677, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0004475013993214816, + "learning_rate": 7.912250114925259e-07, + "loss": -0.0001, + "num_tokens": 205696999.0, + "reward": 8.169711112976074, + "reward_std": 0.029514282941818237, + "rewards/accuracy_reward/mean": 7.419711112976074, + "rewards/accuracy_reward/std": 0.06721202284097672, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1238.0, + "completions/max_terminated_length": 1238.0, + "completions/mean_length": 632.859375, + "completions/mean_terminated_length": 632.859375, + "completions/min_length": 336.0, + "completions/min_terminated_length": 336.0, + "epoch": 0.7347432024169185, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.035354506224393845, + "learning_rate": 7.891450509860541e-07, + "loss": -0.0024, + "num_tokens": 205866830.0, + "reward": 3.2979984283447266, + "reward_std": 1.4700857400894165, + "rewards/accuracy_reward/mean": 2.5479984283447266, + "rewards/accuracy_reward/std": 3.5057332515716553, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1141.0, + "completions/mean_length": 699.609375, + "completions/mean_terminated_length": 678.2063598632812, + "completions/min_length": 334.0, + "completions/min_terminated_length": 334.0, + "epoch": 0.7353474320241692, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0366104319691658, + "learning_rate": 7.870685286270319e-07, + "loss": 0.015, + "num_tokens": 206077173.0, + "reward": 2.6656670570373535, + "reward_std": 1.448268175125122, + "rewards/accuracy_reward/mean": 1.9234795570373535, + "rewards/accuracy_reward/std": 3.3374123573303223, + "rewards/tag_count_reward/mean": 0.7421875, + "rewards/tag_count_reward/std": 0.043842025101184845, + "step": 1217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1095.0, + "completions/max_terminated_length": 1095.0, + "completions/mean_length": 612.1875, + "completions/mean_terminated_length": 612.1875, + "completions/min_length": 410.0, + "completions/min_terminated_length": 410.0, + "epoch": 0.7359516616314199, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04815109446644783, + "learning_rate": 7.849954527088299e-07, + "loss": 0.046, + "num_tokens": 206223441.0, + "reward": 5.062108039855957, + "reward_std": 1.8165220022201538, + "rewards/accuracy_reward/mean": 4.312108039855957, + "rewards/accuracy_reward/std": 3.703756332397461, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 789.0, + "completions/max_terminated_length": 789.0, + "completions/mean_length": 550.984375, + "completions/mean_terminated_length": 550.984375, + "completions/min_length": 361.0, + "completions/min_terminated_length": 361.0, + "epoch": 0.7365558912386707, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04031991586089134, + "learning_rate": 7.829258315110562e-07, + "loss": 0.0177, + "num_tokens": 206381776.0, + "reward": 3.2019970417022705, + "reward_std": 1.7919288873672485, + "rewards/accuracy_reward/mean": 2.4519970417022705, + "rewards/accuracy_reward/std": 3.5211217403411865, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 648.0, + "completions/max_terminated_length": 648.0, + "completions/mean_length": 451.78125, + "completions/mean_terminated_length": 451.78125, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "epoch": 0.7371601208459214, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.012762481346726418, + "learning_rate": 7.808596732995194e-07, + "loss": -0.0038, + "num_tokens": 206520514.0, + "reward": 2.821307897567749, + "reward_std": 0.44911086559295654, + "rewards/accuracy_reward/mean": 2.071307897567749, + "rewards/accuracy_reward/std": 3.261241912841797, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 707.0, + "completions/max_terminated_length": 707.0, + "completions/mean_length": 505.203125, + "completions/mean_terminated_length": 505.203125, + "completions/min_length": 346.0, + "completions/min_terminated_length": 346.0, + "epoch": 0.7377643504531722, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.058224424719810486, + "learning_rate": 7.787969863261984e-07, + "loss": -0.0009, + "num_tokens": 206685903.0, + "reward": 4.408641815185547, + "reward_std": 1.888862133026123, + "rewards/accuracy_reward/mean": 3.658642292022705, + "rewards/accuracy_reward/std": 3.8208911418914795, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 752.0, + "completions/max_terminated_length": 752.0, + "completions/mean_length": 543.25, + "completions/mean_terminated_length": 543.25, + "completions/min_length": 391.0, + "completions/min_terminated_length": 391.0, + "epoch": 0.7383685800604229, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0450594387948513, + "learning_rate": 7.767377788292071e-07, + "loss": -0.0019, + "num_tokens": 206815647.0, + "reward": 1.817275047302246, + "reward_std": 1.75613534450531, + "rewards/accuracy_reward/mean": 1.0672749280929565, + "rewards/accuracy_reward/std": 2.6377952098846436, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 761.0, + "completions/max_terminated_length": 761.0, + "completions/mean_length": 556.109375, + "completions/mean_terminated_length": 556.109375, + "completions/min_length": 417.0, + "completions/min_terminated_length": 417.0, + "epoch": 0.7389728096676738, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.016401074826717377, + "learning_rate": 7.746820590327651e-07, + "loss": -0.0015, + "num_tokens": 206950966.0, + "reward": 6.390707969665527, + "reward_std": 0.49381470680236816, + "rewards/accuracy_reward/mean": 5.640707969665527, + "rewards/accuracy_reward/std": 3.1462602615356445, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 752.0, + "completions/max_terminated_length": 752.0, + "completions/mean_length": 484.375, + "completions/mean_terminated_length": 484.375, + "completions/min_length": 272.0, + "completions/min_terminated_length": 272.0, + "epoch": 0.7395770392749245, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0299379862844944, + "learning_rate": 7.726298351471607e-07, + "loss": -0.0127, + "num_tokens": 207123502.0, + "reward": 2.7249155044555664, + "reward_std": 1.4848988056182861, + "rewards/accuracy_reward/mean": 1.974915623664856, + "rewards/accuracy_reward/std": 3.3196890354156494, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1145.0, + "completions/max_terminated_length": 1145.0, + "completions/mean_length": 642.453125, + "completions/mean_terminated_length": 642.453125, + "completions/min_length": 359.0, + "completions/min_terminated_length": 359.0, + "epoch": 0.7401812688821753, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.025772355496883392, + "learning_rate": 7.705811153687202e-07, + "loss": 0.0103, + "num_tokens": 207298267.0, + "reward": 4.753775119781494, + "reward_std": 0.7780647277832031, + "rewards/accuracy_reward/mean": 4.003775119781494, + "rewards/accuracy_reward/std": 3.7009506225585938, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 913.0, + "completions/max_terminated_length": 913.0, + "completions/mean_length": 546.5625, + "completions/mean_terminated_length": 546.5625, + "completions/min_length": 338.0, + "completions/min_terminated_length": 338.0, + "epoch": 0.740785498489426, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.053778938949108124, + "learning_rate": 7.685359078797759e-07, + "loss": 0.0005, + "num_tokens": 207456031.0, + "reward": 3.9980015754699707, + "reward_std": 2.910159111022949, + "rewards/accuracy_reward/mean": 3.2480015754699707, + "rewards/accuracy_reward/std": 3.7253129482269287, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1382.0, + "completions/max_terminated_length": 1382.0, + "completions/mean_length": 629.421875, + "completions/mean_terminated_length": 629.421875, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 0.7413897280966767, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03795502707362175, + "learning_rate": 7.664942208486313e-07, + "loss": 0.0027, + "num_tokens": 207702714.0, + "reward": 2.0008671283721924, + "reward_std": 1.7061470746994019, + "rewards/accuracy_reward/mean": 1.2508671283721924, + "rewards/accuracy_reward/std": 2.7081024646759033, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 759.0, + "completions/max_terminated_length": 759.0, + "completions/mean_length": 517.0, + "completions/mean_terminated_length": 517.0, + "completions/min_length": 304.0, + "completions/min_terminated_length": 304.0, + "epoch": 0.7419939577039275, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04563801363110542, + "learning_rate": 7.644560624295297e-07, + "loss": -0.0195, + "num_tokens": 207843770.0, + "reward": 3.9264330863952637, + "reward_std": 2.0976216793060303, + "rewards/accuracy_reward/mean": 3.1764330863952637, + "rewards/accuracy_reward/std": 3.6879782676696777, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 744.0, + "completions/max_terminated_length": 744.0, + "completions/mean_length": 508.40625, + "completions/mean_terminated_length": 508.40625, + "completions/min_length": 323.0, + "completions/min_terminated_length": 323.0, + "epoch": 0.7425981873111782, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.028802750632166862, + "learning_rate": 7.62421440762623e-07, + "loss": 0.0025, + "num_tokens": 208018900.0, + "reward": 4.346315860748291, + "reward_std": 1.1083223819732666, + "rewards/accuracy_reward/mean": 3.596315622329712, + "rewards/accuracy_reward/std": 3.7399094104766846, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1062.0, + "completions/max_terminated_length": 1062.0, + "completions/mean_length": 500.25, + "completions/mean_terminated_length": 500.25, + "completions/min_length": 277.0, + "completions/min_terminated_length": 277.0, + "epoch": 0.743202416918429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05459226295351982, + "learning_rate": 7.603903639739358e-07, + "loss": 0.0282, + "num_tokens": 208167732.0, + "reward": 3.2284934520721436, + "reward_std": 3.119936943054199, + "rewards/accuracy_reward/mean": 2.4784932136535645, + "rewards/accuracy_reward/std": 3.4606947898864746, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 743.0, + "completions/max_terminated_length": 743.0, + "completions/mean_length": 499.734375, + "completions/mean_terminated_length": 499.734375, + "completions/min_length": 290.0, + "completions/min_terminated_length": 290.0, + "epoch": 0.7438066465256797, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.052489496767520905, + "learning_rate": 7.583628401753368e-07, + "loss": 0.0156, + "num_tokens": 208325059.0, + "reward": 6.150518417358398, + "reward_std": 2.546288013458252, + "rewards/accuracy_reward/mean": 5.400518417358398, + "rewards/accuracy_reward/std": 3.274322271347046, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 880.0, + "completions/max_terminated_length": 880.0, + "completions/mean_length": 568.0625, + "completions/mean_terminated_length": 568.0625, + "completions/min_length": 422.0, + "completions/min_terminated_length": 422.0, + "epoch": 0.7444108761329306, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.03174813091754913, + "learning_rate": 7.563388774645023e-07, + "loss": -0.0066, + "num_tokens": 208505607.0, + "reward": 3.164064407348633, + "reward_std": 0.9303189516067505, + "rewards/accuracy_reward/mean": 2.4179701805114746, + "rewards/accuracy_reward/std": 3.5587406158447266, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 1232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1872.0, + "completions/max_terminated_length": 1872.0, + "completions/mean_length": 682.375, + "completions/mean_terminated_length": 682.375, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "epoch": 0.7450151057401813, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.043044447898864746, + "learning_rate": 7.543184839248888e-07, + "loss": -0.0055, + "num_tokens": 208699023.0, + "reward": 3.5216171741485596, + "reward_std": 1.473963737487793, + "rewards/accuracy_reward/mean": 2.7716171741485596, + "rewards/accuracy_reward/std": 3.618950128555298, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 916.0, + "completions/max_terminated_length": 916.0, + "completions/mean_length": 538.28125, + "completions/mean_terminated_length": 538.28125, + "completions/min_length": 300.0, + "completions/min_terminated_length": 300.0, + "epoch": 0.7456193353474321, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.05541432276368141, + "learning_rate": 7.523016676256953e-07, + "loss": 0.0181, + "num_tokens": 208860161.0, + "reward": 3.407417058944702, + "reward_std": 2.721238613128662, + "rewards/accuracy_reward/mean": 2.6574172973632812, + "rewards/accuracy_reward/std": 3.576298952102661, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 687.0, + "completions/max_terminated_length": 687.0, + "completions/mean_length": 498.265625, + "completions/mean_terminated_length": 498.265625, + "completions/min_length": 320.0, + "completions/min_terminated_length": 320.0, + "epoch": 0.7462235649546828, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.021957598626613617, + "learning_rate": 7.502884366218346e-07, + "loss": 0.004, + "num_tokens": 209019794.0, + "reward": 4.810722827911377, + "reward_std": 0.7476511001586914, + "rewards/accuracy_reward/mean": 4.060723781585693, + "rewards/accuracy_reward/std": 3.7256858348846436, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 910.0, + "completions/max_terminated_length": 910.0, + "completions/mean_length": 552.28125, + "completions/mean_terminated_length": 552.28125, + "completions/min_length": 358.0, + "completions/min_terminated_length": 358.0, + "epoch": 0.7468277945619335, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.02856624312698841, + "learning_rate": 7.482787989539021e-07, + "loss": 0.0079, + "num_tokens": 209231236.0, + "reward": 5.715888977050781, + "reward_std": 0.9687886834144592, + "rewards/accuracy_reward/mean": 4.965888977050781, + "rewards/accuracy_reward/std": 3.5336079597473145, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 863.0, + "completions/max_terminated_length": 863.0, + "completions/mean_length": 496.28125, + "completions/mean_terminated_length": 496.28125, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 0.7474320241691843, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04383406043052673, + "learning_rate": 7.462727626481393e-07, + "loss": 0.0093, + "num_tokens": 209377302.0, + "reward": 6.994439125061035, + "reward_std": 2.240485668182373, + "rewards/accuracy_reward/mean": 6.244439125061035, + "rewards/accuracy_reward/std": 2.7550201416015625, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1024.0, + "completions/mean_length": 581.421875, + "completions/mean_terminated_length": 581.421875, + "completions/min_length": 401.0, + "completions/min_terminated_length": 401.0, + "epoch": 0.748036253776435, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.03945824131369591, + "learning_rate": 7.442703357164051e-07, + "loss": -0.0048, + "num_tokens": 209511441.0, + "reward": 2.1506078243255615, + "reward_std": 1.3618497848510742, + "rewards/accuracy_reward/mean": 1.4006078243255615, + "rewards/accuracy_reward/std": 2.9386627674102783, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1375.0, + "completions/max_terminated_length": 1375.0, + "completions/mean_length": 616.875, + "completions/mean_terminated_length": 616.875, + "completions/min_length": 343.0, + "completions/min_terminated_length": 343.0, + "epoch": 0.7486404833836858, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.014016296714544296, + "learning_rate": 7.422715261561441e-07, + "loss": 0.0104, + "num_tokens": 209764729.0, + "reward": 0.7033437490463257, + "reward_std": 0.5374862551689148, + "rewards/accuracy_reward/mean": -0.046656250953674316, + "rewards/accuracy_reward/std": 1.0578206777572632, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2028.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 842.15625, + "completions/mean_terminated_length": 842.15625, + "completions/min_length": 330.0, + "completions/min_terminated_length": 330.0, + "epoch": 0.7492447129909365, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.02190323732793331, + "learning_rate": 7.402763419503524e-07, + "loss": 0.011, + "num_tokens": 209962867.0, + "reward": 4.020803928375244, + "reward_std": 1.009249210357666, + "rewards/accuracy_reward/mean": 3.270803928375244, + "rewards/accuracy_reward/std": 3.716238021850586, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1354.0, + "completions/mean_length": 624.171875, + "completions/mean_terminated_length": 601.5714721679688, + "completions/min_length": 312.0, + "completions/min_terminated_length": 312.0, + "epoch": 0.7498489425981874, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05565660446882248, + "learning_rate": 7.382847910675466e-07, + "loss": -0.0587, + "num_tokens": 210131886.0, + "reward": 3.873652935028076, + "reward_std": 2.3551015853881836, + "rewards/accuracy_reward/mean": 3.1353719234466553, + "rewards/accuracy_reward/std": 3.7333829402923584, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 1241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 927.0, + "completions/mean_length": 576.203125, + "completions/mean_terminated_length": 552.84130859375, + "completions/min_length": 309.0, + "completions/min_terminated_length": 309.0, + "epoch": 0.7504531722054381, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.018576469272375107, + "learning_rate": 7.362968814617341e-07, + "loss": -0.0081, + "num_tokens": 210298363.0, + "reward": 2.6733450889587402, + "reward_std": 0.6444448828697205, + "rewards/accuracy_reward/mean": 1.9350640773773193, + "rewards/accuracy_reward/std": 3.2746927738189697, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 1242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1135.0, + "completions/mean_length": 556.859375, + "completions/mean_terminated_length": 508.758056640625, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "epoch": 0.7510574018126889, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.024622736498713493, + "learning_rate": 7.34312621072377e-07, + "loss": -0.0544, + "num_tokens": 210529682.0, + "reward": 2.3769516944885254, + "reward_std": 1.3803455829620361, + "rewards/accuracy_reward/mean": 1.6503890752792358, + "rewards/accuracy_reward/std": 3.210944414138794, + "rewards/tag_count_reward/mean": 0.7265625, + "rewards/tag_count_reward/std": 0.13152606785297394, + "step": 1243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1060.0, + "completions/mean_length": 749.75, + "completions/mean_terminated_length": 639.7288208007812, + "completions/min_length": 313.0, + "completions/min_terminated_length": 313.0, + "epoch": 0.7516616314199396, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0404951311647892, + "learning_rate": 7.323320178243652e-07, + "loss": -0.0671, + "num_tokens": 210674770.0, + "reward": 2.947092056274414, + "reward_std": 1.0236836671829224, + "rewards/accuracy_reward/mean": 2.255685806274414, + "rewards/accuracy_reward/std": 3.5430502891540527, + "rewards/tag_count_reward/mean": 0.69140625, + "rewards/tag_count_reward/std": 0.2028672844171524, + "step": 1244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 767.0, + "completions/max_terminated_length": 767.0, + "completions/mean_length": 513.125, + "completions/mean_terminated_length": 513.125, + "completions/min_length": 332.0, + "completions/min_terminated_length": 332.0, + "epoch": 0.7522658610271903, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04791291058063507, + "learning_rate": 7.303550796279808e-07, + "loss": -0.0108, + "num_tokens": 210803994.0, + "reward": 6.433773040771484, + "reward_std": 2.41111421585083, + "rewards/accuracy_reward/mean": 5.683773517608643, + "rewards/accuracy_reward/std": 3.143895149230957, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1114.0, + "completions/max_terminated_length": 1114.0, + "completions/mean_length": 635.703125, + "completions/mean_terminated_length": 635.703125, + "completions/min_length": 395.0, + "completions/min_terminated_length": 395.0, + "epoch": 0.7528700906344411, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.02483246847987175, + "learning_rate": 7.283818143788691e-07, + "loss": -0.0082, + "num_tokens": 210962471.0, + "reward": 4.749053478240967, + "reward_std": 0.7096343040466309, + "rewards/accuracy_reward/mean": 3.9990532398223877, + "rewards/accuracy_reward/std": 3.701028823852539, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1141.0, + "completions/max_terminated_length": 1141.0, + "completions/mean_length": 766.96875, + "completions/mean_terminated_length": 766.96875, + "completions/min_length": 554.0, + "completions/min_terminated_length": 554.0, + "epoch": 0.7534743202416918, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04916071519255638, + "learning_rate": 7.264122299580056e-07, + "loss": 0.0034, + "num_tokens": 211131157.0, + "reward": 3.730243682861328, + "reward_std": 2.436717987060547, + "rewards/accuracy_reward/mean": 2.980243682861328, + "rewards/accuracy_reward/std": 3.4906346797943115, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 811.0, + "completions/max_terminated_length": 811.0, + "completions/mean_length": 533.28125, + "completions/mean_terminated_length": 533.28125, + "completions/min_length": 327.0, + "completions/min_terminated_length": 327.0, + "epoch": 0.7540785498489426, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.02411770448088646, + "learning_rate": 7.244463342316648e-07, + "loss": 0.0009, + "num_tokens": 211276199.0, + "reward": 1.2928828001022339, + "reward_std": 1.2309362888336182, + "rewards/accuracy_reward/mean": 0.5428828597068787, + "rewards/accuracy_reward/std": 1.8111581802368164, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 957.0, + "completions/max_terminated_length": 957.0, + "completions/mean_length": 501.65625, + "completions/mean_terminated_length": 501.65625, + "completions/min_length": 334.0, + "completions/min_terminated_length": 334.0, + "epoch": 0.7546827794561933, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03590701147913933, + "learning_rate": 7.224841350513899e-07, + "loss": 0.0259, + "num_tokens": 211442545.0, + "reward": 5.716301918029785, + "reward_std": 1.8096117973327637, + "rewards/accuracy_reward/mean": 4.966301918029785, + "rewards/accuracy_reward/std": 3.4535152912139893, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 865.0, + "completions/max_terminated_length": 865.0, + "completions/mean_length": 559.34375, + "completions/mean_terminated_length": 559.34375, + "completions/min_length": 349.0, + "completions/min_terminated_length": 349.0, + "epoch": 0.7552870090634441, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04386342316865921, + "learning_rate": 7.205256402539599e-07, + "loss": 0.0094, + "num_tokens": 211598055.0, + "reward": 4.582767009735107, + "reward_std": 1.677419662475586, + "rewards/accuracy_reward/mean": 3.8327670097351074, + "rewards/accuracy_reward/std": 3.6542675495147705, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 835.0, + "completions/max_terminated_length": 835.0, + "completions/mean_length": 537.0625, + "completions/mean_terminated_length": 537.0625, + "completions/min_length": 317.0, + "completions/min_terminated_length": 317.0, + "epoch": 0.7558912386706949, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.022007372230291367, + "learning_rate": 7.185708576613591e-07, + "loss": 0.0174, + "num_tokens": 211888123.0, + "reward": 4.823956489562988, + "reward_std": 0.7718133330345154, + "rewards/accuracy_reward/mean": 4.073956489562988, + "rewards/accuracy_reward/std": 3.737757682800293, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1251 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 897.0, + "completions/max_terminated_length": 897.0, + "completions/mean_length": 506.9375, + "completions/mean_terminated_length": 506.9375, + "completions/min_length": 298.0, + "completions/min_terminated_length": 298.0, + "epoch": 0.7564954682779457, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05467948317527771, + "learning_rate": 7.166197950807453e-07, + "loss": 0.0218, + "num_tokens": 212006007.0, + "reward": 3.989971160888672, + "reward_std": 3.0910229682922363, + "rewards/accuracy_reward/mean": 3.239971160888672, + "rewards/accuracy_reward/std": 3.703343152999878, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 755.0, + "completions/max_terminated_length": 755.0, + "completions/mean_length": 504.5625, + "completions/mean_terminated_length": 504.5625, + "completions/min_length": 306.0, + "completions/min_terminated_length": 306.0, + "epoch": 0.7570996978851964, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04009352624416351, + "learning_rate": 7.146724603044202e-07, + "loss": -0.0012, + "num_tokens": 212179227.0, + "reward": 5.108343124389648, + "reward_std": 1.5329887866973877, + "rewards/accuracy_reward/mean": 4.362249374389648, + "rewards/accuracy_reward/std": 3.738870620727539, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 1253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 988.0, + "completions/mean_length": 559.46875, + "completions/mean_terminated_length": 535.84130859375, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "epoch": 0.7577039274924471, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.037222571671009064, + "learning_rate": 7.127288611097959e-07, + "loss": -0.0088, + "num_tokens": 212386041.0, + "reward": 4.0987372398376465, + "reward_std": 1.8687942028045654, + "rewards/accuracy_reward/mean": 3.3643627166748047, + "rewards/accuracy_reward/std": 3.7372851371765137, + "rewards/tag_count_reward/mean": 0.734375, + "rewards/tag_count_reward/std": 0.09834947437047958, + "step": 1254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.109375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1242.0, + "completions/mean_length": 884.390625, + "completions/mean_terminated_length": 741.4912109375, + "completions/min_length": 384.0, + "completions/min_terminated_length": 384.0, + "epoch": 0.7583081570996979, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04627181962132454, + "learning_rate": 7.107890052593651e-07, + "loss": -0.1293, + "num_tokens": 212526738.0, + "reward": 2.7105045318603516, + "reward_std": 2.0411503314971924, + "rewards/accuracy_reward/mean": 2.0425357818603516, + "rewards/accuracy_reward/std": 3.532761335372925, + "rewards/tag_count_reward/mean": 0.66796875, + "rewards/tag_count_reward/std": 0.2359323352575302, + "step": 1255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 900.0, + "completions/max_terminated_length": 900.0, + "completions/mean_length": 551.4375, + "completions/mean_terminated_length": 551.4375, + "completions/min_length": 346.0, + "completions/min_terminated_length": 346.0, + "epoch": 0.7589123867069486, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0016103615052998066, + "learning_rate": 7.088529005006714e-07, + "loss": -0.0008, + "num_tokens": 212684846.0, + "reward": 4.505377769470215, + "reward_std": 0.06357965618371964, + "rewards/accuracy_reward/mean": 3.755378246307373, + "rewards/accuracy_reward/std": 3.729259967803955, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 857.0, + "completions/max_terminated_length": 857.0, + "completions/mean_length": 607.484375, + "completions/mean_terminated_length": 607.484375, + "completions/min_length": 437.0, + "completions/min_terminated_length": 437.0, + "epoch": 0.7595166163141994, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04097560793161392, + "learning_rate": 7.069205545662752e-07, + "loss": -0.0132, + "num_tokens": 212865293.0, + "reward": 2.571657657623291, + "reward_std": 1.6637556552886963, + "rewards/accuracy_reward/mean": 1.8216577768325806, + "rewards/accuracy_reward/std": 3.2960634231567383, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 725.0, + "completions/max_terminated_length": 725.0, + "completions/mean_length": 483.78125, + "completions/mean_terminated_length": 483.78125, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "epoch": 0.7601208459214501, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04259340465068817, + "learning_rate": 7.049919751737263e-07, + "loss": 0.0088, + "num_tokens": 213063695.0, + "reward": 4.143270492553711, + "reward_std": 1.7062263488769531, + "rewards/accuracy_reward/mean": 3.393270492553711, + "rewards/accuracy_reward/std": 3.687469482421875, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 888.0, + "completions/max_terminated_length": 888.0, + "completions/mean_length": 593.609375, + "completions/mean_terminated_length": 593.609375, + "completions/min_length": 341.0, + "completions/min_terminated_length": 341.0, + "epoch": 0.760725075528701, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05344085022807121, + "learning_rate": 7.030671700255297e-07, + "loss": -0.004, + "num_tokens": 213267350.0, + "reward": 5.9877610206604, + "reward_std": 2.770303249359131, + "rewards/accuracy_reward/mean": 5.237760543823242, + "rewards/accuracy_reward/std": 3.3575079441070557, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1259 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 721.0, + "completions/max_terminated_length": 721.0, + "completions/mean_length": 493.09375, + "completions/mean_terminated_length": 493.09375, + "completions/min_length": 333.0, + "completions/min_terminated_length": 333.0, + "epoch": 0.7613293051359517, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.001245662453584373, + "learning_rate": 7.011461468091183e-07, + "loss": -0.0002, + "num_tokens": 213403660.0, + "reward": 4.5744781494140625, + "reward_std": 0.0544864796102047, + "rewards/accuracy_reward/mean": 3.8244781494140625, + "rewards/accuracy_reward/std": 3.646836996078491, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 979.0, + "completions/max_terminated_length": 979.0, + "completions/mean_length": 588.671875, + "completions/mean_terminated_length": 588.671875, + "completions/min_length": 326.0, + "completions/min_terminated_length": 326.0, + "epoch": 0.7619335347432025, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.002977146068587899, + "learning_rate": 6.992289131968194e-07, + "loss": 0.0013, + "num_tokens": 213573367.0, + "reward": 2.3624563217163086, + "reward_std": 0.13356152176856995, + "rewards/accuracy_reward/mean": 1.612456202507019, + "rewards/accuracy_reward/std": 3.4217207431793213, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1261 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 787.0, + "completions/max_terminated_length": 787.0, + "completions/mean_length": 528.765625, + "completions/mean_terminated_length": 528.765625, + "completions/min_length": 284.0, + "completions/min_terminated_length": 284.0, + "epoch": 0.7625377643504532, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04337332770228386, + "learning_rate": 6.973154768458245e-07, + "loss": 0.0092, + "num_tokens": 213750232.0, + "reward": 2.7299482822418213, + "reward_std": 2.3632752895355225, + "rewards/accuracy_reward/mean": 1.9799485206604004, + "rewards/accuracy_reward/std": 3.318199872970581, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1262 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1078.0, + "completions/max_terminated_length": 1078.0, + "completions/mean_length": 567.375, + "completions/mean_terminated_length": 567.375, + "completions/min_length": 363.0, + "completions/min_terminated_length": 363.0, + "epoch": 0.7631419939577039, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0358428917825222, + "learning_rate": 6.954058453981609e-07, + "loss": -0.0008, + "num_tokens": 213931488.0, + "reward": 5.712454795837402, + "reward_std": 1.420504093170166, + "rewards/accuracy_reward/mean": 4.962454795837402, + "rewards/accuracy_reward/std": 3.601793050765991, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1039.0, + "completions/mean_length": 586.96875, + "completions/mean_terminated_length": 539.8386840820312, + "completions/min_length": 378.0, + "completions/min_terminated_length": 378.0, + "epoch": 0.7637462235649547, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03560158237814903, + "learning_rate": 6.935000264806587e-07, + "loss": -0.0084, + "num_tokens": 214078462.0, + "reward": 3.62039852142334, + "reward_std": 1.5947295427322388, + "rewards/accuracy_reward/mean": 2.89383602142334, + "rewards/accuracy_reward/std": 3.9109320640563965, + "rewards/tag_count_reward/mean": 0.7265625, + "rewards/tag_count_reward/std": 0.13152606785297394, + "step": 1264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 737.0, + "completions/max_terminated_length": 737.0, + "completions/mean_length": 462.625, + "completions/mean_terminated_length": 462.625, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, + "epoch": 0.7643504531722054, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03487652167677879, + "learning_rate": 6.915980277049206e-07, + "loss": -0.0158, + "num_tokens": 214210470.0, + "reward": 5.385367393493652, + "reward_std": 2.111126184463501, + "rewards/accuracy_reward/mean": 4.635367393493652, + "rewards/accuracy_reward/std": 3.65360951423645, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 850.0, + "completions/max_terminated_length": 850.0, + "completions/mean_length": 555.03125, + "completions/mean_terminated_length": 555.03125, + "completions/min_length": 330.0, + "completions/min_terminated_length": 330.0, + "epoch": 0.7649546827794562, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.03307396546006203, + "learning_rate": 6.896998566672937e-07, + "loss": -0.0062, + "num_tokens": 214378792.0, + "reward": 2.0886733531951904, + "reward_std": 1.6564453840255737, + "rewards/accuracy_reward/mean": 1.33867347240448, + "rewards/accuracy_reward/std": 2.814652442932129, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1266 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 950.0, + "completions/max_terminated_length": 950.0, + "completions/mean_length": 563.75, + "completions/mean_terminated_length": 563.75, + "completions/min_length": 341.0, + "completions/min_terminated_length": 341.0, + "epoch": 0.7655589123867069, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03923119977116585, + "learning_rate": 6.878055209488363e-07, + "loss": 0.0088, + "num_tokens": 214619848.0, + "reward": 3.645054578781128, + "reward_std": 1.4200835227966309, + "rewards/accuracy_reward/mean": 2.895054578781128, + "rewards/accuracy_reward/std": 3.6445579528808594, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1267 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1147.0, + "completions/max_terminated_length": 1147.0, + "completions/mean_length": 587.21875, + "completions/mean_terminated_length": 587.21875, + "completions/min_length": 321.0, + "completions/min_terminated_length": 321.0, + "epoch": 0.7661631419939577, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.02270517498254776, + "learning_rate": 6.85915028115289e-07, + "loss": 0.0111, + "num_tokens": 214786326.0, + "reward": 4.42326545715332, + "reward_std": 0.49396196007728577, + "rewards/accuracy_reward/mean": 3.6732656955718994, + "rewards/accuracy_reward/std": 3.621901035308838, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1345.0, + "completions/max_terminated_length": 1345.0, + "completions/mean_length": 643.578125, + "completions/mean_terminated_length": 643.578125, + "completions/min_length": 321.0, + "completions/min_terminated_length": 321.0, + "epoch": 0.7667673716012084, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.020269107073545456, + "learning_rate": 6.840283857170452e-07, + "loss": 0.005, + "num_tokens": 214969723.0, + "reward": 4.830559730529785, + "reward_std": 0.9225589036941528, + "rewards/accuracy_reward/mean": 4.080559253692627, + "rewards/accuracy_reward/std": 3.616838216781616, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1106.0, + "completions/max_terminated_length": 1106.0, + "completions/mean_length": 596.53125, + "completions/mean_terminated_length": 596.53125, + "completions/min_length": 330.0, + "completions/min_terminated_length": 330.0, + "epoch": 0.7673716012084593, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.018020380288362503, + "learning_rate": 6.821456012891194e-07, + "loss": 0.0058, + "num_tokens": 215117661.0, + "reward": 6.24558162689209, + "reward_std": 0.5340880751609802, + "rewards/accuracy_reward/mean": 5.495581150054932, + "rewards/accuracy_reward/std": 3.1930322647094727, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 754.0, + "completions/max_terminated_length": 754.0, + "completions/mean_length": 531.96875, + "completions/mean_terminated_length": 531.96875, + "completions/min_length": 297.0, + "completions/min_terminated_length": 297.0, + "epoch": 0.76797583081571, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04743003472685814, + "learning_rate": 6.802666823511185e-07, + "loss": -0.0056, + "num_tokens": 215323947.0, + "reward": 3.5564703941345215, + "reward_std": 2.3154027462005615, + "rewards/accuracy_reward/mean": 2.8064703941345215, + "rewards/accuracy_reward/std": 3.6246118545532227, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1003.0, + "completions/max_terminated_length": 1003.0, + "completions/mean_length": 528.359375, + "completions/mean_terminated_length": 528.359375, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 0.7685800604229607, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.055186327546834946, + "learning_rate": 6.783916364072101e-07, + "loss": -0.0003, + "num_tokens": 215521026.0, + "reward": 3.596531391143799, + "reward_std": 2.887460947036743, + "rewards/accuracy_reward/mean": 2.846531391143799, + "rewards/accuracy_reward/std": 3.6863605976104736, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 965.0, + "completions/max_terminated_length": 965.0, + "completions/mean_length": 618.625, + "completions/mean_terminated_length": 618.625, + "completions/min_length": 368.0, + "completions/min_terminated_length": 368.0, + "epoch": 0.7691842900302115, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05310048907995224, + "learning_rate": 6.765204709460949e-07, + "loss": 0.014, + "num_tokens": 215698954.0, + "reward": 5.654379844665527, + "reward_std": 2.8216018676757812, + "rewards/accuracy_reward/mean": 4.904379844665527, + "rewards/accuracy_reward/std": 3.526113748550415, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 717.0, + "completions/mean_length": 561.328125, + "completions/mean_terminated_length": 537.7301635742188, + "completions/min_length": 368.0, + "completions/min_terminated_length": 368.0, + "epoch": 0.7697885196374622, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03186532109975815, + "learning_rate": 6.746531934409743e-07, + "loss": -0.0356, + "num_tokens": 215859007.0, + "reward": 1.966967225074768, + "reward_std": 1.4806866645812988, + "rewards/accuracy_reward/mean": 1.228685975074768, + "rewards/accuracy_reward/std": 2.7146518230438232, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 1274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 872.0, + "completions/max_terminated_length": 872.0, + "completions/mean_length": 536.453125, + "completions/mean_terminated_length": 536.453125, + "completions/min_length": 309.0, + "completions/min_terminated_length": 309.0, + "epoch": 0.770392749244713, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.043952107429504395, + "learning_rate": 6.727898113495217e-07, + "loss": -0.006, + "num_tokens": 216012684.0, + "reward": 3.865321636199951, + "reward_std": 2.2373976707458496, + "rewards/accuracy_reward/mean": 3.1153221130371094, + "rewards/accuracy_reward/std": 3.743633270263672, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1008.0, + "completions/max_terminated_length": 1008.0, + "completions/mean_length": 552.59375, + "completions/mean_terminated_length": 552.59375, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "epoch": 0.7709969788519637, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.017428454011678696, + "learning_rate": 6.709303321138539e-07, + "loss": -0.0037, + "num_tokens": 216196322.0, + "reward": 0.9181421995162964, + "reward_std": 0.7420403361320496, + "rewards/accuracy_reward/mean": 0.1681421995162964, + "rewards/accuracy_reward/std": 1.3281829357147217, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 921.0, + "completions/max_terminated_length": 921.0, + "completions/mean_length": 543.9375, + "completions/mean_terminated_length": 543.9375, + "completions/min_length": 282.0, + "completions/min_terminated_length": 282.0, + "epoch": 0.7716012084592145, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0334259457886219, + "learning_rate": 6.690747631604989e-07, + "loss": 0.0027, + "num_tokens": 216376958.0, + "reward": 5.956989288330078, + "reward_std": 1.1654996871948242, + "rewards/accuracy_reward/mean": 5.20698881149292, + "rewards/accuracy_reward/std": 3.447232484817505, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1277 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 814.0, + "completions/mean_length": 608.828125, + "completions/mean_terminated_length": 585.984130859375, + "completions/min_length": 402.0, + "completions/min_terminated_length": 402.0, + "epoch": 0.7722054380664652, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.038249220699071884, + "learning_rate": 6.672231119003683e-07, + "loss": 0.0019, + "num_tokens": 216521731.0, + "reward": 4.593306541442871, + "reward_std": 1.7928123474121094, + "rewards/accuracy_reward/mean": 3.855024814605713, + "rewards/accuracy_reward/std": 3.7012572288513184, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 1278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 831.0, + "completions/max_terminated_length": 831.0, + "completions/mean_length": 459.703125, + "completions/mean_terminated_length": 459.703125, + "completions/min_length": 269.0, + "completions/min_terminated_length": 269.0, + "epoch": 0.7728096676737161, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.027287933975458145, + "learning_rate": 6.653753857287258e-07, + "loss": 0.0039, + "num_tokens": 216673120.0, + "reward": 7.716835975646973, + "reward_std": 0.8664618730545044, + "rewards/accuracy_reward/mean": 6.970742225646973, + "rewards/accuracy_reward/std": 1.8147931098937988, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 1279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1081.0, + "completions/max_terminated_length": 1081.0, + "completions/mean_length": 582.484375, + "completions/mean_terminated_length": 582.484375, + "completions/min_length": 317.0, + "completions/min_terminated_length": 317.0, + "epoch": 0.7734138972809668, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05859467014670372, + "learning_rate": 6.635315920251606e-07, + "loss": -0.0401, + "num_tokens": 216814543.0, + "reward": 4.445479869842529, + "reward_std": 2.7966716289520264, + "rewards/accuracy_reward/mean": 3.6954798698425293, + "rewards/accuracy_reward/std": 3.6163151264190674, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 732.0, + "completions/max_terminated_length": 732.0, + "completions/mean_length": 517.0625, + "completions/mean_terminated_length": 517.0625, + "completions/min_length": 353.0, + "completions/min_terminated_length": 353.0, + "epoch": 0.7740181268882175, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03890800103545189, + "learning_rate": 6.616917381535547e-07, + "loss": 0.0057, + "num_tokens": 216943923.0, + "reward": 6.896829128265381, + "reward_std": 1.5973052978515625, + "rewards/accuracy_reward/mean": 6.146829605102539, + "rewards/accuracy_reward/std": 2.8227763175964355, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 928.0, + "completions/max_terminated_length": 928.0, + "completions/mean_length": 645.0625, + "completions/mean_terminated_length": 645.0625, + "completions/min_length": 415.0, + "completions/min_terminated_length": 415.0, + "epoch": 0.7746223564954683, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03669075667858124, + "learning_rate": 6.598558314620549e-07, + "loss": 0.0079, + "num_tokens": 217086119.0, + "reward": 3.8265719413757324, + "reward_std": 1.366403341293335, + "rewards/accuracy_reward/mean": 3.0765719413757324, + "rewards/accuracy_reward/std": 3.638679027557373, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1079.0, + "completions/mean_length": 740.78125, + "completions/mean_terminated_length": 720.0317993164062, + "completions/min_length": 427.0, + "completions/min_terminated_length": 427.0, + "epoch": 0.775226586102719, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.053295355290174484, + "learning_rate": 6.580238792830447e-07, + "loss": -0.0382, + "num_tokens": 217254201.0, + "reward": 3.7462282180786133, + "reward_std": 2.714010000228882, + "rewards/accuracy_reward/mean": 3.0079469680786133, + "rewards/accuracy_reward/std": 3.5599617958068848, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 1283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1298.0, + "completions/max_terminated_length": 1298.0, + "completions/mean_length": 614.40625, + "completions/mean_terminated_length": 614.40625, + "completions/min_length": 375.0, + "completions/min_terminated_length": 375.0, + "epoch": 0.7758308157099698, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03417164459824562, + "learning_rate": 6.561958889331121e-07, + "loss": 0.0091, + "num_tokens": 217386691.0, + "reward": 3.739103078842163, + "reward_std": 1.0551178455352783, + "rewards/accuracy_reward/mean": 2.989103078842163, + "rewards/accuracy_reward/std": 3.595879554748535, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 622.0, + "completions/max_terminated_length": 622.0, + "completions/mean_length": 406.296875, + "completions/mean_terminated_length": 406.296875, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "epoch": 0.7764350453172205, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.024742860347032547, + "learning_rate": 6.543718677130238e-07, + "loss": 0.006, + "num_tokens": 217560614.0, + "reward": 6.043149948120117, + "reward_std": 0.9989257454872131, + "rewards/accuracy_reward/mean": 5.293149948120117, + "rewards/accuracy_reward/std": 3.3750829696655273, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1568.0, + "completions/mean_length": 836.5625, + "completions/mean_terminated_length": 817.3333740234375, + "completions/min_length": 544.0, + "completions/min_terminated_length": 544.0, + "epoch": 0.7770392749244713, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04234333708882332, + "learning_rate": 6.525518229076924e-07, + "loss": -0.0182, + "num_tokens": 217745498.0, + "reward": 4.157371520996094, + "reward_std": 1.6885672807693481, + "rewards/accuracy_reward/mean": 3.419090747833252, + "rewards/accuracy_reward/std": 3.7106895446777344, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 1286 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 872.0, + "completions/mean_length": 572.53125, + "completions/mean_terminated_length": 549.1111450195312, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "epoch": 0.777643504531722, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.055668871849775314, + "learning_rate": 6.507357617861512e-07, + "loss": -0.0313, + "num_tokens": 217942780.0, + "reward": 3.632032871246338, + "reward_std": 2.5226285457611084, + "rewards/accuracy_reward/mean": 2.897657871246338, + "rewards/accuracy_reward/std": 3.6917614936828613, + "rewards/tag_count_reward/mean": 0.734375, + "rewards/tag_count_reward/std": 0.09834947437047958, + "step": 1287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 694.0, + "completions/max_terminated_length": 694.0, + "completions/mean_length": 485.578125, + "completions/mean_terminated_length": 485.578125, + "completions/min_length": 278.0, + "completions/min_terminated_length": 278.0, + "epoch": 0.7782477341389729, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.027035225182771683, + "learning_rate": 6.489236916015213e-07, + "loss": -0.0072, + "num_tokens": 218154961.0, + "reward": 3.9772090911865234, + "reward_std": 0.891127347946167, + "rewards/accuracy_reward/mean": 3.2272093296051025, + "rewards/accuracy_reward/std": 3.7082173824310303, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1731.0, + "completions/max_terminated_length": 1731.0, + "completions/mean_length": 750.921875, + "completions/mean_terminated_length": 750.921875, + "completions/min_length": 341.0, + "completions/min_terminated_length": 341.0, + "epoch": 0.7788519637462236, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.025593766942620277, + "learning_rate": 6.471156195909854e-07, + "loss": -0.0075, + "num_tokens": 218357100.0, + "reward": 4.211240768432617, + "reward_std": 1.1707972288131714, + "rewards/accuracy_reward/mean": 3.461240530014038, + "rewards/accuracy_reward/std": 3.6056551933288574, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1289 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 906.0, + "completions/max_terminated_length": 906.0, + "completions/mean_length": 550.5625, + "completions/mean_terminated_length": 550.5625, + "completions/min_length": 331.0, + "completions/min_terminated_length": 331.0, + "epoch": 0.7794561933534743, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.041547372937202454, + "learning_rate": 6.453115529757584e-07, + "loss": -0.005, + "num_tokens": 218515184.0, + "reward": 3.3954453468322754, + "reward_std": 1.0432984828948975, + "rewards/accuracy_reward/mean": 2.6454453468322754, + "rewards/accuracy_reward/std": 3.6298415660858154, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 791.0, + "completions/max_terminated_length": 791.0, + "completions/mean_length": 465.28125, + "completions/mean_terminated_length": 465.28125, + "completions/min_length": 294.0, + "completions/min_terminated_length": 294.0, + "epoch": 0.7800604229607251, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.012342631816864014, + "learning_rate": 6.435114989610574e-07, + "loss": 0.0007, + "num_tokens": 218662194.0, + "reward": 2.736893653869629, + "reward_std": 0.5052832961082458, + "rewards/accuracy_reward/mean": 1.990799903869629, + "rewards/accuracy_reward/std": 3.2881414890289307, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 1291 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1239.0, + "completions/max_terminated_length": 1239.0, + "completions/mean_length": 699.046875, + "completions/mean_terminated_length": 699.046875, + "completions/min_length": 359.0, + "completions/min_terminated_length": 359.0, + "epoch": 0.7806646525679758, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.029261518269777298, + "learning_rate": 6.417154647360738e-07, + "loss": 0.0044, + "num_tokens": 218833445.0, + "reward": 4.124260902404785, + "reward_std": 0.9347842335700989, + "rewards/accuracy_reward/mean": 3.3742611408233643, + "rewards/accuracy_reward/std": 3.724095582962036, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 771.0, + "completions/max_terminated_length": 771.0, + "completions/mean_length": 596.171875, + "completions/mean_terminated_length": 596.171875, + "completions/min_length": 407.0, + "completions/min_terminated_length": 407.0, + "epoch": 0.7812688821752266, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.023305702954530716, + "learning_rate": 6.39923457473945e-07, + "loss": 0.0078, + "num_tokens": 218987136.0, + "reward": 4.213541030883789, + "reward_std": 0.6896659135818481, + "rewards/accuracy_reward/mean": 3.463540554046631, + "rewards/accuracy_reward/std": 3.7362868785858154, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1335.0, + "completions/max_terminated_length": 1335.0, + "completions/mean_length": 561.21875, + "completions/mean_terminated_length": 561.21875, + "completions/min_length": 398.0, + "completions/min_terminated_length": 398.0, + "epoch": 0.7818731117824773, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04893454909324646, + "learning_rate": 6.381354843317245e-07, + "loss": 0.0104, + "num_tokens": 219121390.0, + "reward": 5.084853172302246, + "reward_std": 1.7807481288909912, + "rewards/accuracy_reward/mean": 4.334853172302246, + "rewards/accuracy_reward/std": 3.6670310497283936, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 698.0, + "completions/max_terminated_length": 698.0, + "completions/mean_length": 508.828125, + "completions/mean_terminated_length": 508.828125, + "completions/min_length": 326.0, + "completions/min_terminated_length": 326.0, + "epoch": 0.7824773413897281, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03695017099380493, + "learning_rate": 6.363515524503539e-07, + "loss": 0.0099, + "num_tokens": 219253955.0, + "reward": 4.996479511260986, + "reward_std": 1.3960429430007935, + "rewards/accuracy_reward/mean": 4.246479511260986, + "rewards/accuracy_reward/std": 3.6750149726867676, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 927.0, + "completions/max_terminated_length": 927.0, + "completions/mean_length": 583.484375, + "completions/mean_terminated_length": 583.484375, + "completions/min_length": 344.0, + "completions/min_terminated_length": 344.0, + "epoch": 0.7830815709969788, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05796482414007187, + "learning_rate": 6.345716689546361e-07, + "loss": 0.0249, + "num_tokens": 219400466.0, + "reward": 4.750607967376709, + "reward_std": 3.294689655303955, + "rewards/accuracy_reward/mean": 4.000607967376709, + "rewards/accuracy_reward/std": 3.6592628955841064, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 981.0, + "completions/max_terminated_length": 981.0, + "completions/mean_length": 540.734375, + "completions/mean_terminated_length": 540.734375, + "completions/min_length": 391.0, + "completions/min_terminated_length": 391.0, + "epoch": 0.7836858006042297, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.02673017419874668, + "learning_rate": 6.32795840953203e-07, + "loss": 0.0114, + "num_tokens": 219557153.0, + "reward": 4.802432537078857, + "reward_std": 0.8523476123809814, + "rewards/accuracy_reward/mean": 4.056339263916016, + "rewards/accuracy_reward/std": 3.7653467655181885, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 1297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1301.0, + "completions/max_terminated_length": 1301.0, + "completions/mean_length": 617.390625, + "completions/mean_terminated_length": 617.390625, + "completions/min_length": 360.0, + "completions/min_terminated_length": 360.0, + "epoch": 0.7842900302114804, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03412773460149765, + "learning_rate": 6.310240755384911e-07, + "loss": 0.0119, + "num_tokens": 219742826.0, + "reward": 2.8669674396514893, + "reward_std": 1.3312830924987793, + "rewards/accuracy_reward/mean": 2.11696720123291, + "rewards/accuracy_reward/std": 3.661231517791748, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1365.0, + "completions/max_terminated_length": 1365.0, + "completions/mean_length": 587.375, + "completions/mean_terminated_length": 587.375, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.7848942598187311, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.01674610748887062, + "learning_rate": 6.292563797867104e-07, + "loss": 0.0118, + "num_tokens": 219916642.0, + "reward": 3.0334115028381348, + "reward_std": 0.7335650324821472, + "rewards/accuracy_reward/mean": 2.2834112644195557, + "rewards/accuracy_reward/std": 3.244194269180298, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1360.0, + "completions/max_terminated_length": 1360.0, + "completions/mean_length": 552.5, + "completions/mean_terminated_length": 552.5, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "epoch": 0.7854984894259819, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.017143800854682922, + "learning_rate": 6.274927607578182e-07, + "loss": 0.001, + "num_tokens": 220082418.0, + "reward": 4.28641414642334, + "reward_std": 0.571631908416748, + "rewards/accuracy_reward/mean": 3.53641414642334, + "rewards/accuracy_reward/std": 3.777895212173462, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 883.0, + "completions/max_terminated_length": 883.0, + "completions/mean_length": 628.03125, + "completions/mean_terminated_length": 628.03125, + "completions/min_length": 410.0, + "completions/min_terminated_length": 410.0, + "epoch": 0.7861027190332326, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.025949399918317795, + "learning_rate": 6.257332254954888e-07, + "loss": 0.0051, + "num_tokens": 220240644.0, + "reward": 4.111674785614014, + "reward_std": 0.9194079041481018, + "rewards/accuracy_reward/mean": 3.3616750240325928, + "rewards/accuracy_reward/std": 3.645089626312256, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1301 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 911.0, + "completions/mean_length": 687.640625, + "completions/mean_terminated_length": 666.0476684570312, + "completions/min_length": 427.0, + "completions/min_terminated_length": 427.0, + "epoch": 0.7867069486404834, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.01704312488436699, + "learning_rate": 6.239777810270865e-07, + "loss": -0.0168, + "num_tokens": 220397117.0, + "reward": 6.431288719177246, + "reward_std": 0.5005779266357422, + "rewards/accuracy_reward/mean": 5.693007946014404, + "rewards/accuracy_reward/std": 3.2142996788024902, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 1302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 837.0, + "completions/max_terminated_length": 837.0, + "completions/mean_length": 564.015625, + "completions/mean_terminated_length": 564.015625, + "completions/min_length": 345.0, + "completions/min_terminated_length": 345.0, + "epoch": 0.7873111782477341, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04764117673039436, + "learning_rate": 6.222264343636387e-07, + "loss": 0.0153, + "num_tokens": 220529998.0, + "reward": 6.068654537200928, + "reward_std": 2.299055576324463, + "rewards/accuracy_reward/mean": 5.318655014038086, + "rewards/accuracy_reward/std": 3.3303616046905518, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 866.0, + "completions/max_terminated_length": 866.0, + "completions/mean_length": 543.375, + "completions/mean_terminated_length": 543.375, + "completions/min_length": 290.0, + "completions/min_terminated_length": 290.0, + "epoch": 0.7879154078549849, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.023319119587540627, + "learning_rate": 6.204791924998055e-07, + "loss": 0.0182, + "num_tokens": 220676150.0, + "reward": 4.364365577697754, + "reward_std": 0.4694589376449585, + "rewards/accuracy_reward/mean": 3.614365577697754, + "rewards/accuracy_reward/std": 3.7586827278137207, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1247.0, + "completions/max_terminated_length": 1247.0, + "completions/mean_length": 585.328125, + "completions/mean_terminated_length": 585.328125, + "completions/min_length": 271.0, + "completions/min_terminated_length": 271.0, + "epoch": 0.7885196374622356, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.05638190358877182, + "learning_rate": 6.187360624138527e-07, + "loss": 0.01, + "num_tokens": 220864459.0, + "reward": 1.9671437740325928, + "reward_std": 1.7749847173690796, + "rewards/accuracy_reward/mean": 1.2171437740325928, + "rewards/accuracy_reward/std": 3.04620623588562, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 679.0, + "completions/max_terminated_length": 679.0, + "completions/mean_length": 442.625, + "completions/mean_terminated_length": 442.625, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, + "epoch": 0.7891238670694865, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.02917945384979248, + "learning_rate": 6.169970510676258e-07, + "loss": -0.0128, + "num_tokens": 220992723.0, + "reward": 7.488525867462158, + "reward_std": 1.372849702835083, + "rewards/accuracy_reward/mean": 6.738525390625, + "rewards/accuracy_reward/std": 2.185150623321533, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 850.0, + "completions/max_terminated_length": 850.0, + "completions/mean_length": 543.765625, + "completions/mean_terminated_length": 543.765625, + "completions/min_length": 307.0, + "completions/min_terminated_length": 307.0, + "epoch": 0.7897280966767372, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03150317817926407, + "learning_rate": 6.15262165406519e-07, + "loss": -0.0008, + "num_tokens": 221124868.0, + "reward": 5.730540752410889, + "reward_std": 1.364458680152893, + "rewards/accuracy_reward/mean": 4.980540752410889, + "rewards/accuracy_reward/std": 3.49283766746521, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1307 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 657.0, + "completions/max_terminated_length": 657.0, + "completions/mean_length": 469.703125, + "completions/mean_terminated_length": 469.703125, + "completions/min_length": 310.0, + "completions/min_terminated_length": 310.0, + "epoch": 0.7903323262839879, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.012288222089409828, + "learning_rate": 6.135314123594495e-07, + "loss": -0.002, + "num_tokens": 221261569.0, + "reward": 4.338804244995117, + "reward_std": 0.47465020418167114, + "rewards/accuracy_reward/mean": 3.5888047218322754, + "rewards/accuracy_reward/std": 3.732133150100708, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1019.0, + "completions/max_terminated_length": 1019.0, + "completions/mean_length": 569.3125, + "completions/mean_terminated_length": 569.3125, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "epoch": 0.7909365558912387, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.020042002201080322, + "learning_rate": 6.118047988388293e-07, + "loss": 0.0008, + "num_tokens": 221578533.0, + "reward": 2.4647140502929688, + "reward_std": 0.5240023136138916, + "rewards/accuracy_reward/mean": 1.7147140502929688, + "rewards/accuracy_reward/std": 3.1633400917053223, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1240.0, + "completions/max_terminated_length": 1240.0, + "completions/mean_length": 607.6875, + "completions/mean_terminated_length": 607.6875, + "completions/min_length": 277.0, + "completions/min_terminated_length": 277.0, + "epoch": 0.7915407854984894, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.060450199991464615, + "learning_rate": 6.100823317405381e-07, + "loss": 0.0426, + "num_tokens": 221751953.0, + "reward": 3.143401622772217, + "reward_std": 2.8721046447753906, + "rewards/accuracy_reward/mean": 2.393401622772217, + "rewards/accuracy_reward/std": 3.54443097114563, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1231.0, + "completions/max_terminated_length": 1231.0, + "completions/mean_length": 594.640625, + "completions/mean_terminated_length": 594.640625, + "completions/min_length": 380.0, + "completions/min_terminated_length": 380.0, + "epoch": 0.7921450151057402, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.05682096630334854, + "learning_rate": 6.083640179438946e-07, + "loss": 0.0175, + "num_tokens": 221891946.0, + "reward": 3.3474373817443848, + "reward_std": 2.406282424926758, + "rewards/accuracy_reward/mean": 2.5974373817443848, + "rewards/accuracy_reward/std": 3.6001553535461426, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 791.0, + "completions/max_terminated_length": 791.0, + "completions/mean_length": 502.140625, + "completions/mean_terminated_length": 502.140625, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 0.7927492447129909, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0483490526676178, + "learning_rate": 6.066498643116301e-07, + "loss": 0.0051, + "num_tokens": 222082691.0, + "reward": 6.918656349182129, + "reward_std": 2.349588632583618, + "rewards/accuracy_reward/mean": 6.168656349182129, + "rewards/accuracy_reward/std": 2.8213019371032715, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 894.0, + "completions/max_terminated_length": 894.0, + "completions/mean_length": 566.296875, + "completions/mean_terminated_length": 566.296875, + "completions/min_length": 337.0, + "completions/min_terminated_length": 337.0, + "epoch": 0.7933534743202417, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04965054616332054, + "learning_rate": 6.049398776898614e-07, + "loss": 0.0365, + "num_tokens": 222214630.0, + "reward": 4.036679744720459, + "reward_std": 2.294372797012329, + "rewards/accuracy_reward/mean": 3.28667950630188, + "rewards/accuracy_reward/std": 3.673508405685425, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1313 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 834.0, + "completions/max_terminated_length": 834.0, + "completions/mean_length": 517.78125, + "completions/mean_terminated_length": 517.78125, + "completions/min_length": 356.0, + "completions/min_terminated_length": 356.0, + "epoch": 0.7939577039274924, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.029149023815989494, + "learning_rate": 6.032340649080617e-07, + "loss": -0.0087, + "num_tokens": 222385144.0, + "reward": 5.862238883972168, + "reward_std": 0.8458971977233887, + "rewards/accuracy_reward/mean": 5.112238883972168, + "rewards/accuracy_reward/std": 3.474165916442871, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1314 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1783.0, + "completions/max_terminated_length": 1783.0, + "completions/mean_length": 623.3125, + "completions/mean_terminated_length": 623.3125, + "completions/min_length": 354.0, + "completions/min_terminated_length": 354.0, + "epoch": 0.7945619335347432, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03064817562699318, + "learning_rate": 6.015324327790345e-07, + "loss": -0.0064, + "num_tokens": 222625868.0, + "reward": 3.6295998096466064, + "reward_std": 1.0399240255355835, + "rewards/accuracy_reward/mean": 2.8796000480651855, + "rewards/accuracy_reward/std": 3.6885876655578613, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1530.0, + "completions/max_terminated_length": 1530.0, + "completions/mean_length": 567.359375, + "completions/mean_terminated_length": 567.359375, + "completions/min_length": 317.0, + "completions/min_terminated_length": 317.0, + "epoch": 0.795166163141994, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.026701919734477997, + "learning_rate": 5.998349880988866e-07, + "loss": 0.0218, + "num_tokens": 222766643.0, + "reward": 2.2420578002929688, + "reward_std": 0.74031662940979, + "rewards/accuracy_reward/mean": 1.4920578002929688, + "rewards/accuracy_reward/std": 2.978691339492798, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1109.0, + "completions/max_terminated_length": 1109.0, + "completions/mean_length": 715.0, + "completions/mean_terminated_length": 715.0, + "completions/min_length": 429.0, + "completions/min_terminated_length": 429.0, + "epoch": 0.7957703927492447, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04208563268184662, + "learning_rate": 5.981417376470011e-07, + "loss": -0.0268, + "num_tokens": 222926883.0, + "reward": 6.944573402404785, + "reward_std": 2.1801323890686035, + "rewards/accuracy_reward/mean": 6.194573402404785, + "rewards/accuracy_reward/std": 2.790257453918457, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 822.0, + "completions/max_terminated_length": 822.0, + "completions/mean_length": 519.671875, + "completions/mean_terminated_length": 519.671875, + "completions/min_length": 331.0, + "completions/min_terminated_length": 331.0, + "epoch": 0.7963746223564955, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.015389185398817062, + "learning_rate": 5.964526881860091e-07, + "loss": 0.0088, + "num_tokens": 223095102.0, + "reward": 2.6661453247070312, + "reward_std": 0.49352940917015076, + "rewards/accuracy_reward/mean": 1.9161453247070312, + "rewards/accuracy_reward/std": 3.362583875656128, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1288.0, + "completions/max_terminated_length": 1288.0, + "completions/mean_length": 596.890625, + "completions/mean_terminated_length": 596.890625, + "completions/min_length": 329.0, + "completions/min_terminated_length": 329.0, + "epoch": 0.7969788519637462, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.029034851118922234, + "learning_rate": 5.947678464617634e-07, + "loss": 0.0056, + "num_tokens": 223247767.0, + "reward": 4.677087783813477, + "reward_std": 1.383829116821289, + "rewards/accuracy_reward/mean": 3.9270873069763184, + "rewards/accuracy_reward/std": 3.599578857421875, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1319 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 850.0, + "completions/max_terminated_length": 850.0, + "completions/mean_length": 537.546875, + "completions/mean_terminated_length": 537.546875, + "completions/min_length": 299.0, + "completions/min_terminated_length": 299.0, + "epoch": 0.797583081570997, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04363923519849777, + "learning_rate": 5.93087219203313e-07, + "loss": -0.0102, + "num_tokens": 223457050.0, + "reward": 6.098133087158203, + "reward_std": 1.8736594915390015, + "rewards/accuracy_reward/mean": 5.348133087158203, + "rewards/accuracy_reward/std": 3.409271240234375, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1108.0, + "completions/mean_length": 588.921875, + "completions/mean_terminated_length": 565.761962890625, + "completions/min_length": 364.0, + "completions/min_terminated_length": 364.0, + "epoch": 0.7981873111782477, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03231072053313255, + "learning_rate": 5.91410813122873e-07, + "loss": -0.037, + "num_tokens": 223617093.0, + "reward": 4.321696758270264, + "reward_std": 1.4940463304519653, + "rewards/accuracy_reward/mean": 3.5834155082702637, + "rewards/accuracy_reward/std": 3.77990460395813, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 1321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 800.0, + "completions/mean_length": 590.890625, + "completions/mean_terminated_length": 567.761962890625, + "completions/min_length": 373.0, + "completions/min_terminated_length": 373.0, + "epoch": 0.7987915407854985, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.016463985666632652, + "learning_rate": 5.897386349158007e-07, + "loss": -0.0116, + "num_tokens": 223759406.0, + "reward": 4.5294671058654785, + "reward_std": 0.5931648015975952, + "rewards/accuracy_reward/mean": 3.7911860942840576, + "rewards/accuracy_reward/std": 3.737675428390503, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 1322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 797.0, + "completions/max_terminated_length": 797.0, + "completions/mean_length": 561.734375, + "completions/mean_terminated_length": 561.734375, + "completions/min_length": 332.0, + "completions/min_terminated_length": 332.0, + "epoch": 0.7993957703927492, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.017079543322324753, + "learning_rate": 5.88070691260568e-07, + "loss": 0.0098, + "num_tokens": 223926125.0, + "reward": 4.103688716888428, + "reward_std": 0.4721722900867462, + "rewards/accuracy_reward/mean": 3.353689193725586, + "rewards/accuracy_reward/std": 4.008139133453369, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 970.0, + "completions/max_terminated_length": 970.0, + "completions/mean_length": 677.203125, + "completions/mean_terminated_length": 677.203125, + "completions/min_length": 423.0, + "completions/min_terminated_length": 423.0, + "epoch": 0.8, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03825846686959267, + "learning_rate": 5.864069888187332e-07, + "loss": -0.0144, + "num_tokens": 224088170.0, + "reward": 3.320485830307007, + "reward_std": 1.544992208480835, + "rewards/accuracy_reward/mean": 2.570485830307007, + "rewards/accuracy_reward/std": 3.570261001586914, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 827.0, + "completions/max_terminated_length": 827.0, + "completions/mean_length": 559.875, + "completions/mean_terminated_length": 559.875, + "completions/min_length": 405.0, + "completions/min_terminated_length": 405.0, + "epoch": 0.8006042296072508, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0003315791254863143, + "learning_rate": 5.847475342349178e-07, + "loss": 0.0, + "num_tokens": 224266114.0, + "reward": 4.48452615737915, + "reward_std": 0.013607176020741463, + "rewards/accuracy_reward/mean": 3.7345261573791504, + "rewards/accuracy_reward/std": 3.764096260070801, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 804.0, + "completions/max_terminated_length": 804.0, + "completions/mean_length": 538.9375, + "completions/mean_terminated_length": 538.9375, + "completions/min_length": 312.0, + "completions/min_terminated_length": 312.0, + "epoch": 0.8012084592145015, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04516584426164627, + "learning_rate": 5.830923341367757e-07, + "loss": 0.0011, + "num_tokens": 224459086.0, + "reward": 2.207204818725586, + "reward_std": 1.8869632482528687, + "rewards/accuracy_reward/mean": 1.457204818725586, + "rewards/accuracy_reward/std": 2.903937578201294, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 893.0, + "completions/max_terminated_length": 893.0, + "completions/mean_length": 596.28125, + "completions/mean_terminated_length": 596.28125, + "completions/min_length": 410.0, + "completions/min_terminated_length": 410.0, + "epoch": 0.8018126888217523, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04705860838294029, + "learning_rate": 5.814413951349705e-07, + "loss": -0.0085, + "num_tokens": 224639280.0, + "reward": 3.230257511138916, + "reward_std": 1.9181288480758667, + "rewards/accuracy_reward/mean": 2.480257511138916, + "rewards/accuracy_reward/std": 3.5099973678588867, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1141.0, + "completions/max_terminated_length": 1141.0, + "completions/mean_length": 613.375, + "completions/mean_terminated_length": 613.375, + "completions/min_length": 427.0, + "completions/min_terminated_length": 427.0, + "epoch": 0.802416918429003, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004328933544456959, + "learning_rate": 5.797947238231473e-07, + "loss": -0.0019, + "num_tokens": 224780344.0, + "reward": 4.418056488037109, + "reward_std": 0.12490049749612808, + "rewards/accuracy_reward/mean": 3.6680562496185303, + "rewards/accuracy_reward/std": 3.8303897380828857, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 936.0, + "completions/max_terminated_length": 936.0, + "completions/mean_length": 537.140625, + "completions/mean_terminated_length": 537.140625, + "completions/min_length": 315.0, + "completions/min_terminated_length": 315.0, + "epoch": 0.8030211480362538, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04215463995933533, + "learning_rate": 5.781523267779052e-07, + "loss": 0.0009, + "num_tokens": 224946049.0, + "reward": 4.468451499938965, + "reward_std": 1.849932312965393, + "rewards/accuracy_reward/mean": 3.718451499938965, + "rewards/accuracy_reward/std": 3.747950792312622, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1329 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 699.0, + "completions/max_terminated_length": 699.0, + "completions/mean_length": 415.28125, + "completions/mean_terminated_length": 415.28125, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "epoch": 0.8036253776435045, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03182701766490936, + "learning_rate": 5.765142105587744e-07, + "loss": -0.0185, + "num_tokens": 225074835.0, + "reward": 4.7711310386657715, + "reward_std": 1.362401008605957, + "rewards/accuracy_reward/mean": 4.0211310386657715, + "rewards/accuracy_reward/std": 3.723100185394287, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 811.0, + "completions/max_terminated_length": 811.0, + "completions/mean_length": 493.734375, + "completions/mean_terminated_length": 493.734375, + "completions/min_length": 293.0, + "completions/min_terminated_length": 293.0, + "epoch": 0.8042296072507553, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004354151431471109, + "learning_rate": 5.748803817081868e-07, + "loss": 0.0008, + "num_tokens": 225196546.0, + "reward": 4.539013862609863, + "reward_std": 0.1253422498703003, + "rewards/accuracy_reward/mean": 3.7890143394470215, + "rewards/accuracy_reward/std": 3.6724348068237305, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1331 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1043.0, + "completions/max_terminated_length": 1043.0, + "completions/mean_length": 600.453125, + "completions/mean_terminated_length": 600.453125, + "completions/min_length": 375.0, + "completions/min_terminated_length": 375.0, + "epoch": 0.804833836858006, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.00038351057446561754, + "learning_rate": 5.732508467514508e-07, + "loss": -0.0004, + "num_tokens": 225349503.0, + "reward": 6.306509017944336, + "reward_std": 0.023966249078512192, + "rewards/accuracy_reward/mean": 5.556509017944336, + "rewards/accuracy_reward/std": 3.233961820602417, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 740.0, + "completions/max_terminated_length": 740.0, + "completions/mean_length": 547.359375, + "completions/mean_terminated_length": 547.359375, + "completions/min_length": 365.0, + "completions/min_terminated_length": 365.0, + "epoch": 0.8054380664652568, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.02811517007648945, + "learning_rate": 5.716256121967267e-07, + "loss": 0.0242, + "num_tokens": 225543174.0, + "reward": 5.8710527420043945, + "reward_std": 1.225536584854126, + "rewards/accuracy_reward/mean": 5.121053218841553, + "rewards/accuracy_reward/std": 3.480048179626465, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 804.0, + "completions/mean_length": 566.609375, + "completions/mean_terminated_length": 543.0952758789062, + "completions/min_length": 338.0, + "completions/min_terminated_length": 338.0, + "epoch": 0.8060422960725075, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.053324680775403976, + "learning_rate": 5.700046845349988e-07, + "loss": -0.012, + "num_tokens": 225750189.0, + "reward": 5.104206085205078, + "reward_std": 2.3275327682495117, + "rewards/accuracy_reward/mean": 4.369831085205078, + "rewards/accuracy_reward/std": 3.745098829269409, + "rewards/tag_count_reward/mean": 0.734375, + "rewards/tag_count_reward/std": 0.09834947437047958, + "step": 1334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 814.0, + "completions/max_terminated_length": 814.0, + "completions/mean_length": 542.65625, + "completions/mean_terminated_length": 542.65625, + "completions/min_length": 360.0, + "completions/min_terminated_length": 360.0, + "epoch": 0.8066465256797583, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.016752462834119797, + "learning_rate": 5.683880702400496e-07, + "loss": 0.0042, + "num_tokens": 225933159.0, + "reward": 8.086071014404297, + "reward_std": 0.5493027567863464, + "rewards/accuracy_reward/mean": 7.3360700607299805, + "rewards/accuracy_reward/std": 1.0589282512664795, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 809.0, + "completions/max_terminated_length": 809.0, + "completions/mean_length": 554.609375, + "completions/mean_terminated_length": 554.609375, + "completions/min_length": 367.0, + "completions/min_terminated_length": 367.0, + "epoch": 0.8072507552870091, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.018109219148755074, + "learning_rate": 5.667757757684366e-07, + "loss": 0.0126, + "num_tokens": 226107598.0, + "reward": 6.231122016906738, + "reward_std": 0.48676592111587524, + "rewards/accuracy_reward/mean": 5.481122016906738, + "rewards/accuracy_reward/std": 3.3226442337036133, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 967.0, + "completions/max_terminated_length": 967.0, + "completions/mean_length": 578.5, + "completions/mean_terminated_length": 578.5, + "completions/min_length": 308.0, + "completions/min_terminated_length": 308.0, + "epoch": 0.8078549848942598, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.022449038922786713, + "learning_rate": 5.65167807559462e-07, + "loss": -0.0068, + "num_tokens": 226354542.0, + "reward": 2.300995349884033, + "reward_std": 0.7333595156669617, + "rewards/accuracy_reward/mean": 1.5509953498840332, + "rewards/accuracy_reward/std": 3.021475076675415, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1292.0, + "completions/max_terminated_length": 1292.0, + "completions/mean_length": 638.234375, + "completions/mean_terminated_length": 638.234375, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "epoch": 0.8084592145015106, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04832478612661362, + "learning_rate": 5.635641720351505e-07, + "loss": 0.0017, + "num_tokens": 226526285.0, + "reward": 3.300011396408081, + "reward_std": 2.3988733291625977, + "rewards/accuracy_reward/mean": 2.550011157989502, + "rewards/accuracy_reward/std": 3.554650068283081, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 896.0, + "completions/max_terminated_length": 896.0, + "completions/mean_length": 536.90625, + "completions/mean_terminated_length": 536.90625, + "completions/min_length": 333.0, + "completions/min_terminated_length": 333.0, + "epoch": 0.8090634441087613, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.027713323011994362, + "learning_rate": 5.619648756002232e-07, + "loss": 0.0088, + "num_tokens": 226730903.0, + "reward": 2.9321155548095703, + "reward_std": 0.9624354839324951, + "rewards/accuracy_reward/mean": 2.1821157932281494, + "rewards/accuracy_reward/std": 3.7392899990081787, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1339 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 582.0, + "completions/max_terminated_length": 582.0, + "completions/mean_length": 421.9375, + "completions/mean_terminated_length": 421.9375, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 0.8096676737160121, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.02047797664999962, + "learning_rate": 5.603699246420711e-07, + "loss": -0.0107, + "num_tokens": 226881203.0, + "reward": 7.884159564971924, + "reward_std": 0.6479955315589905, + "rewards/accuracy_reward/mean": 7.134159564971924, + "rewards/accuracy_reward/std": 1.2931275367736816, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 857.0, + "completions/max_terminated_length": 857.0, + "completions/mean_length": 541.390625, + "completions/mean_terminated_length": 541.390625, + "completions/min_length": 287.0, + "completions/min_terminated_length": 287.0, + "epoch": 0.8102719033232628, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.003903940785676241, + "learning_rate": 5.587793255307292e-07, + "loss": 0.0, + "num_tokens": 227020300.0, + "reward": 4.540060997009277, + "reward_std": 0.11912976950407028, + "rewards/accuracy_reward/mean": 3.7900609970092773, + "rewards/accuracy_reward/std": 3.6622185707092285, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 761.0, + "completions/max_terminated_length": 761.0, + "completions/mean_length": 512.140625, + "completions/mean_terminated_length": 512.140625, + "completions/min_length": 311.0, + "completions/min_terminated_length": 311.0, + "epoch": 0.8108761329305136, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03251628577709198, + "learning_rate": 5.571930846188524e-07, + "loss": 0.0086, + "num_tokens": 227224741.0, + "reward": 7.976128101348877, + "reward_std": 0.9446902871131897, + "rewards/accuracy_reward/mean": 7.226128578186035, + "rewards/accuracy_reward/std": 1.308491587638855, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 978.0, + "completions/max_terminated_length": 978.0, + "completions/mean_length": 555.15625, + "completions/mean_terminated_length": 555.15625, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, + "epoch": 0.8114803625377643, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04267873242497444, + "learning_rate": 5.556112082416889e-07, + "loss": -0.0477, + "num_tokens": 227379727.0, + "reward": 6.378602981567383, + "reward_std": 2.140669107437134, + "rewards/accuracy_reward/mean": 5.628602981567383, + "rewards/accuracy_reward/std": 3.1162617206573486, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 859.0, + "completions/max_terminated_length": 859.0, + "completions/mean_length": 515.125, + "completions/mean_terminated_length": 515.125, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "epoch": 0.8120845921450152, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.055275700986385345, + "learning_rate": 5.540337027170566e-07, + "loss": 0.0177, + "num_tokens": 227606103.0, + "reward": 6.846523284912109, + "reward_std": 1.806631326675415, + "rewards/accuracy_reward/mean": 6.096523284912109, + "rewards/accuracy_reward/std": 2.8441531658172607, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 887.0, + "completions/max_terminated_length": 887.0, + "completions/mean_length": 543.484375, + "completions/mean_terminated_length": 543.484375, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "epoch": 0.8126888217522659, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0629369467496872, + "learning_rate": 5.524605743453159e-07, + "loss": -0.0322, + "num_tokens": 227786470.0, + "reward": 3.776132583618164, + "reward_std": 1.425851821899414, + "rewards/accuracy_reward/mean": 3.026132583618164, + "rewards/accuracy_reward/std": 3.6873881816864014, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1051.0, + "completions/max_terminated_length": 1051.0, + "completions/mean_length": 576.21875, + "completions/mean_terminated_length": 576.21875, + "completions/min_length": 371.0, + "completions/min_terminated_length": 371.0, + "epoch": 0.8132930513595166, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.02875826135277748, + "learning_rate": 5.508918294093451e-07, + "loss": 0.0009, + "num_tokens": 227944404.0, + "reward": 2.709764003753662, + "reward_std": 1.1529247760772705, + "rewards/accuracy_reward/mean": 1.959764003753662, + "rewards/accuracy_reward/std": 3.322451114654541, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 786.0, + "completions/max_terminated_length": 786.0, + "completions/mean_length": 539.84375, + "completions/mean_terminated_length": 539.84375, + "completions/min_length": 356.0, + "completions/min_terminated_length": 356.0, + "epoch": 0.8138972809667674, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.00028922545607201755, + "learning_rate": 5.493274741745169e-07, + "loss": -0.0, + "num_tokens": 228128378.0, + "reward": 2.612370491027832, + "reward_std": 0.008264416828751564, + "rewards/accuracy_reward/mean": 1.862370252609253, + "rewards/accuracy_reward/std": 3.251260280609131, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1716.0, + "completions/mean_length": 689.890625, + "completions/mean_terminated_length": 646.0806274414062, + "completions/min_length": 291.0, + "completions/min_terminated_length": 291.0, + "epoch": 0.8145015105740181, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06496655941009521, + "learning_rate": 5.477675148886707e-07, + "loss": -0.0457, + "num_tokens": 228285187.0, + "reward": 4.030643939971924, + "reward_std": 3.0337352752685547, + "rewards/accuracy_reward/mean": 3.3079872131347656, + "rewards/accuracy_reward/std": 3.7969048023223877, + "rewards/tag_count_reward/mean": 0.72265625, + "rewards/tag_count_reward/std": 0.13449780642986298, + "step": 1348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1252.0, + "completions/mean_length": 684.65625, + "completions/mean_terminated_length": 640.6774291992188, + "completions/min_length": 420.0, + "completions/min_terminated_length": 420.0, + "epoch": 0.8151057401812689, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03730182722210884, + "learning_rate": 5.462119577820897e-07, + "loss": -0.0211, + "num_tokens": 228538733.0, + "reward": 0.9186625480651855, + "reward_std": 1.0207207202911377, + "rewards/accuracy_reward/mean": 0.19210001826286316, + "rewards/accuracy_reward/std": 1.6586520671844482, + "rewards/tag_count_reward/mean": 0.7265625, + "rewards/tag_count_reward/std": 0.13152606785297394, + "step": 1349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1943.0, + "completions/mean_length": 847.375, + "completions/mean_terminated_length": 828.3175048828125, + "completions/min_length": 273.0, + "completions/min_terminated_length": 273.0, + "epoch": 0.8157099697885196, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.030091632157564163, + "learning_rate": 5.446608090674754e-07, + "loss": 0.0045, + "num_tokens": 228696917.0, + "reward": 5.8765950202941895, + "reward_std": 0.9591089487075806, + "rewards/accuracy_reward/mean": 5.1305012702941895, + "rewards/accuracy_reward/std": 3.565650701522827, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 1350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 640.0, + "completions/max_terminated_length": 640.0, + "completions/mean_length": 418.578125, + "completions/mean_terminated_length": 418.578125, + "completions/min_length": 279.0, + "completions/min_terminated_length": 279.0, + "epoch": 0.8163141993957704, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03479243442416191, + "learning_rate": 5.431140749399226e-07, + "loss": 0.0036, + "num_tokens": 228847466.0, + "reward": 4.188208103179932, + "reward_std": 1.9131735563278198, + "rewards/accuracy_reward/mean": 3.4382081031799316, + "rewards/accuracy_reward/std": 3.954225778579712, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1351 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 889.0, + "completions/max_terminated_length": 889.0, + "completions/mean_length": 478.8125, + "completions/mean_terminated_length": 478.8125, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 0.8169184290030211, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.040126677602529526, + "learning_rate": 5.415717615768941e-07, + "loss": 0.0164, + "num_tokens": 229080622.0, + "reward": 3.353818655014038, + "reward_std": 1.429055094718933, + "rewards/accuracy_reward/mean": 2.603818893432617, + "rewards/accuracy_reward/std": 3.504404067993164, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 796.0, + "completions/max_terminated_length": 796.0, + "completions/mean_length": 485.34375, + "completions/mean_terminated_length": 485.34375, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 0.817522658610272, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.02764320932328701, + "learning_rate": 5.400338751381982e-07, + "loss": -0.0253, + "num_tokens": 229234340.0, + "reward": 4.238015651702881, + "reward_std": 1.7697044610977173, + "rewards/accuracy_reward/mean": 3.488015651702881, + "rewards/accuracy_reward/std": 3.6470911502838135, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1012.0, + "completions/mean_length": 628.25, + "completions/mean_terminated_length": 605.7142944335938, + "completions/min_length": 335.0, + "completions/min_terminated_length": 335.0, + "epoch": 0.8181268882175227, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.02734432928264141, + "learning_rate": 5.385004217659617e-07, + "loss": -0.0377, + "num_tokens": 229357524.0, + "reward": 7.708415508270264, + "reward_std": 1.685173749923706, + "rewards/accuracy_reward/mean": 6.970134735107422, + "rewards/accuracy_reward/std": 1.8822942972183228, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 1354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 982.0, + "completions/mean_length": 518.6875, + "completions/mean_terminated_length": 494.4127197265625, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 0.8187311178247734, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.042565274983644485, + "learning_rate": 5.369714075846062e-07, + "loss": -0.0498, + "num_tokens": 229552512.0, + "reward": 6.4112701416015625, + "reward_std": 1.8849507570266724, + "rewards/accuracy_reward/mean": 5.672989368438721, + "rewards/accuracy_reward/std": 3.17318058013916, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 1355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1401.0, + "completions/max_terminated_length": 1401.0, + "completions/mean_length": 533.90625, + "completions/mean_terminated_length": 533.90625, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 0.8193353474320242, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.035596564412117004, + "learning_rate": 5.354468387008236e-07, + "loss": 0.0245, + "num_tokens": 229746618.0, + "reward": 5.039579391479492, + "reward_std": 1.3946702480316162, + "rewards/accuracy_reward/mean": 4.289579391479492, + "rewards/accuracy_reward/std": 3.6664066314697266, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1065.0, + "completions/max_terminated_length": 1065.0, + "completions/mean_length": 606.953125, + "completions/mean_terminated_length": 606.953125, + "completions/min_length": 275.0, + "completions/min_terminated_length": 275.0, + "epoch": 0.8199395770392749, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.05152088403701782, + "learning_rate": 5.339267212035526e-07, + "loss": 0.0026, + "num_tokens": 229930823.0, + "reward": 4.511541843414307, + "reward_std": 1.7331007719039917, + "rewards/accuracy_reward/mean": 3.761542320251465, + "rewards/accuracy_reward/std": 3.670295238494873, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1357 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1092.0, + "completions/mean_length": 696.609375, + "completions/mean_terminated_length": 653.01611328125, + "completions/min_length": 314.0, + "completions/min_terminated_length": 314.0, + "epoch": 0.8205438066465257, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04540547728538513, + "learning_rate": 5.324110611639532e-07, + "loss": -0.0267, + "num_tokens": 230112846.0, + "reward": 1.4956609010696411, + "reward_std": 1.9958090782165527, + "rewards/accuracy_reward/mean": 0.7730047106742859, + "rewards/accuracy_reward/std": 2.3800697326660156, + "rewards/tag_count_reward/mean": 0.72265625, + "rewards/tag_count_reward/std": 0.13449780642986298, + "step": 1358 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 857.0, + "completions/max_terminated_length": 857.0, + "completions/mean_length": 617.109375, + "completions/mean_terminated_length": 617.109375, + "completions/min_length": 430.0, + "completions/min_terminated_length": 430.0, + "epoch": 0.8211480362537764, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05400915443897247, + "learning_rate": 5.308998646353822e-07, + "loss": 0.0176, + "num_tokens": 230287477.0, + "reward": 5.042145252227783, + "reward_std": 2.7689075469970703, + "rewards/accuracy_reward/mean": 4.292145252227783, + "rewards/accuracy_reward/std": 3.7503769397735596, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1359 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 702.0, + "completions/max_terminated_length": 702.0, + "completions/mean_length": 521.875, + "completions/mean_terminated_length": 521.875, + "completions/min_length": 343.0, + "completions/min_terminated_length": 343.0, + "epoch": 0.8217522658610272, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.02438397891819477, + "learning_rate": 5.293931376533711e-07, + "loss": 0.0145, + "num_tokens": 230476077.0, + "reward": 6.003777980804443, + "reward_std": 1.093226671218872, + "rewards/accuracy_reward/mean": 5.253777980804443, + "rewards/accuracy_reward/std": 3.425689220428467, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1061.0, + "completions/max_terminated_length": 1061.0, + "completions/mean_length": 628.09375, + "completions/mean_terminated_length": 628.09375, + "completions/min_length": 302.0, + "completions/min_terminated_length": 302.0, + "epoch": 0.8223564954682779, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.052193619310855865, + "learning_rate": 5.278908862355995e-07, + "loss": -0.0115, + "num_tokens": 230723827.0, + "reward": 6.520179748535156, + "reward_std": 2.3348870277404785, + "rewards/accuracy_reward/mean": 5.770179748535156, + "rewards/accuracy_reward/std": 3.1881043910980225, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1361 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1543.0, + "completions/max_terminated_length": 1543.0, + "completions/mean_length": 624.359375, + "completions/mean_terminated_length": 624.359375, + "completions/min_length": 299.0, + "completions/min_terminated_length": 299.0, + "epoch": 0.8229607250755288, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.060377806425094604, + "learning_rate": 5.26393116381872e-07, + "loss": 0.0321, + "num_tokens": 230948426.0, + "reward": 3.863926410675049, + "reward_std": 2.3636608123779297, + "rewards/accuracy_reward/mean": 3.113926410675049, + "rewards/accuracy_reward/std": 3.741914749145508, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1362 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 827.0, + "completions/max_terminated_length": 827.0, + "completions/mean_length": 608.09375, + "completions/mean_terminated_length": 608.09375, + "completions/min_length": 387.0, + "completions/min_terminated_length": 387.0, + "epoch": 0.8235649546827795, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.039137061685323715, + "learning_rate": 5.248998340740957e-07, + "loss": -0.0095, + "num_tokens": 231101280.0, + "reward": 3.706202983856201, + "reward_std": 1.710796594619751, + "rewards/accuracy_reward/mean": 2.956202983856201, + "rewards/accuracy_reward/std": 3.6512889862060547, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1363 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1065.0, + "completions/max_terminated_length": 1065.0, + "completions/mean_length": 528.796875, + "completions/mean_terminated_length": 528.796875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "epoch": 0.8241691842900302, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03734326362609863, + "learning_rate": 5.234110452762535e-07, + "loss": -0.0159, + "num_tokens": 231253267.0, + "reward": 5.084203243255615, + "reward_std": 1.0736148357391357, + "rewards/accuracy_reward/mean": 4.334203243255615, + "rewards/accuracy_reward/std": 3.785565137863159, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 887.0, + "completions/max_terminated_length": 887.0, + "completions/mean_length": 612.78125, + "completions/mean_terminated_length": 612.78125, + "completions/min_length": 377.0, + "completions/min_terminated_length": 377.0, + "epoch": 0.824773413897281, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0473204143345356, + "learning_rate": 5.219267559343825e-07, + "loss": -0.0082, + "num_tokens": 231422181.0, + "reward": 4.248176574707031, + "reward_std": 1.9059895277023315, + "rewards/accuracy_reward/mean": 3.4981765747070312, + "rewards/accuracy_reward/std": 3.7536044120788574, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1112.0, + "completions/max_terminated_length": 1112.0, + "completions/mean_length": 563.5625, + "completions/mean_terminated_length": 563.5625, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, + "epoch": 0.8253776435045317, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.033119942992925644, + "learning_rate": 5.204469719765495e-07, + "loss": -0.0058, + "num_tokens": 231596009.0, + "reward": 3.3145813941955566, + "reward_std": 1.496830701828003, + "rewards/accuracy_reward/mean": 2.5645811557769775, + "rewards/accuracy_reward/std": 3.6254661083221436, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1366 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 911.0, + "completions/max_terminated_length": 911.0, + "completions/mean_length": 496.953125, + "completions/mean_terminated_length": 496.953125, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 0.8259818731117825, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04663668945431709, + "learning_rate": 5.189716993128281e-07, + "loss": 0.0139, + "num_tokens": 231730294.0, + "reward": 5.498456001281738, + "reward_std": 2.237534999847412, + "rewards/accuracy_reward/mean": 4.748456001281738, + "rewards/accuracy_reward/std": 3.5849947929382324, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1367 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 843.0, + "completions/max_terminated_length": 843.0, + "completions/mean_length": 528.484375, + "completions/mean_terminated_length": 528.484375, + "completions/min_length": 372.0, + "completions/min_terminated_length": 372.0, + "epoch": 0.8265861027190332, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04454357177019119, + "learning_rate": 5.175009438352725e-07, + "loss": 0.0135, + "num_tokens": 231883477.0, + "reward": 5.678822040557861, + "reward_std": 1.6990375518798828, + "rewards/accuracy_reward/mean": 4.928821563720703, + "rewards/accuracy_reward/std": 3.505466938018799, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 700.0, + "completions/max_terminated_length": 700.0, + "completions/mean_length": 528.75, + "completions/mean_terminated_length": 528.75, + "completions/min_length": 370.0, + "completions/min_terminated_length": 370.0, + "epoch": 0.827190332326284, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.049093544483184814, + "learning_rate": 5.160347114178972e-07, + "loss": 0.0242, + "num_tokens": 232132645.0, + "reward": 5.159590244293213, + "reward_std": 2.1714377403259277, + "rewards/accuracy_reward/mean": 4.409590721130371, + "rewards/accuracy_reward/std": 3.710524797439575, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1369 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 835.0, + "completions/max_terminated_length": 835.0, + "completions/mean_length": 500.84375, + "completions/mean_terminated_length": 500.84375, + "completions/min_length": 288.0, + "completions/min_terminated_length": 288.0, + "epoch": 0.8277945619335347, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03168391063809395, + "learning_rate": 5.145730079166522e-07, + "loss": -0.0247, + "num_tokens": 232280283.0, + "reward": 3.263368606567383, + "reward_std": 1.4908045530319214, + "rewards/accuracy_reward/mean": 2.513368606567383, + "rewards/accuracy_reward/std": 3.512648344039917, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1297.0, + "completions/max_terminated_length": 1297.0, + "completions/mean_length": 706.65625, + "completions/mean_terminated_length": 706.65625, + "completions/min_length": 504.0, + "completions/min_terminated_length": 504.0, + "epoch": 0.8283987915407856, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.049412503838539124, + "learning_rate": 5.13115839169399e-07, + "loss": 0.0023, + "num_tokens": 232452213.0, + "reward": 1.7567640542984009, + "reward_std": 1.724145770072937, + "rewards/accuracy_reward/mean": 1.0067640542984009, + "rewards/accuracy_reward/std": 2.44024395942688, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1371 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 831.0, + "completions/max_terminated_length": 831.0, + "completions/mean_length": 433.421875, + "completions/mean_terminated_length": 433.421875, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 0.8290030211480363, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04256247356534004, + "learning_rate": 5.116632109958881e-07, + "loss": 0.0072, + "num_tokens": 232649616.0, + "reward": 5.2591142654418945, + "reward_std": 1.712854266166687, + "rewards/accuracy_reward/mean": 4.5091142654418945, + "rewards/accuracy_reward/std": 3.6606080532073975, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 875.0, + "completions/max_terminated_length": 875.0, + "completions/mean_length": 531.296875, + "completions/mean_terminated_length": 531.296875, + "completions/min_length": 296.0, + "completions/min_terminated_length": 296.0, + "epoch": 0.829607250755287, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.03335312753915787, + "learning_rate": 5.102151291977354e-07, + "loss": -0.0173, + "num_tokens": 232796579.0, + "reward": 1.4651046991348267, + "reward_std": 1.4557844400405884, + "rewards/accuracy_reward/mean": 0.7151046991348267, + "rewards/accuracy_reward/std": 2.174787998199463, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1373 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 778.0, + "completions/max_terminated_length": 778.0, + "completions/mean_length": 506.46875, + "completions/mean_terminated_length": 506.46875, + "completions/min_length": 382.0, + "completions/min_terminated_length": 382.0, + "epoch": 0.8302114803625378, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.024066757410764694, + "learning_rate": 5.087715995583995e-07, + "loss": 0.0006, + "num_tokens": 232940209.0, + "reward": 4.0053391456604, + "reward_std": 0.8672870993614197, + "rewards/accuracy_reward/mean": 3.2553391456604004, + "rewards/accuracy_reward/std": 3.6911849975585938, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 756.0, + "completions/max_terminated_length": 756.0, + "completions/mean_length": 514.234375, + "completions/mean_terminated_length": 514.234375, + "completions/min_length": 289.0, + "completions/min_terminated_length": 289.0, + "epoch": 0.8308157099697885, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03374012932181358, + "learning_rate": 5.073326278431579e-07, + "loss": 0.0191, + "num_tokens": 233069312.0, + "reward": 5.799511432647705, + "reward_std": 1.6557495594024658, + "rewards/accuracy_reward/mean": 5.049511432647705, + "rewards/accuracy_reward/std": 3.4419825077056885, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1499.0, + "completions/max_terminated_length": 1499.0, + "completions/mean_length": 705.71875, + "completions/mean_terminated_length": 705.71875, + "completions/min_length": 431.0, + "completions/min_terminated_length": 431.0, + "epoch": 0.8314199395770393, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0010374907869845629, + "learning_rate": 5.05898219799084e-07, + "loss": -0.0002, + "num_tokens": 233238430.0, + "reward": 2.6177093982696533, + "reward_std": 0.03401027247309685, + "rewards/accuracy_reward/mean": 1.8677092790603638, + "rewards/accuracy_reward/std": 3.2443926334381104, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 779.0, + "completions/max_terminated_length": 779.0, + "completions/mean_length": 486.421875, + "completions/mean_terminated_length": 486.421875, + "completions/min_length": 307.0, + "completions/min_terminated_length": 307.0, + "epoch": 0.83202416918429, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.023809365928173065, + "learning_rate": 5.044683811550256e-07, + "loss": 0.0023, + "num_tokens": 233456457.0, + "reward": 2.256682872772217, + "reward_std": 0.7589640021324158, + "rewards/accuracy_reward/mean": 1.5066828727722168, + "rewards/accuracy_reward/std": 3.1517598628997803, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1070.0, + "completions/max_terminated_length": 1070.0, + "completions/mean_length": 681.328125, + "completions/mean_terminated_length": 681.328125, + "completions/min_length": 359.0, + "completions/min_terminated_length": 359.0, + "epoch": 0.8326283987915408, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.046291906386613846, + "learning_rate": 5.030431176215797e-07, + "loss": 0.0103, + "num_tokens": 233661902.0, + "reward": 1.683117151260376, + "reward_std": 1.7502387762069702, + "rewards/accuracy_reward/mean": 0.9331172108650208, + "rewards/accuracy_reward/std": 2.496814250946045, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 774.0, + "completions/max_terminated_length": 774.0, + "completions/mean_length": 549.171875, + "completions/mean_terminated_length": 549.171875, + "completions/min_length": 401.0, + "completions/min_terminated_length": 401.0, + "epoch": 0.8332326283987915, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.021980129182338715, + "learning_rate": 5.016224348910712e-07, + "loss": 0.0063, + "num_tokens": 233830329.0, + "reward": 2.4977827072143555, + "reward_std": 0.4661034941673279, + "rewards/accuracy_reward/mean": 1.7477827072143555, + "rewards/accuracy_reward/std": 3.1839170455932617, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1379 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 921.0, + "completions/max_terminated_length": 921.0, + "completions/mean_length": 498.5, + "completions/mean_terminated_length": 498.5, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.8338368580060423, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.020948871970176697, + "learning_rate": 5.002063386375302e-07, + "loss": -0.0009, + "num_tokens": 233999769.0, + "reward": 6.04162073135376, + "reward_std": 0.6618981957435608, + "rewards/accuracy_reward/mean": 5.291621208190918, + "rewards/accuracy_reward/std": 3.337209463119507, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 930.0, + "completions/max_terminated_length": 930.0, + "completions/mean_length": 518.140625, + "completions/mean_terminated_length": 518.140625, + "completions/min_length": 278.0, + "completions/min_terminated_length": 278.0, + "epoch": 0.834441087613293, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03600389137864113, + "learning_rate": 4.987948345166689e-07, + "loss": 0.0145, + "num_tokens": 234180850.0, + "reward": 3.9413204193115234, + "reward_std": 1.838616967201233, + "rewards/accuracy_reward/mean": 3.1913204193115234, + "rewards/accuracy_reward/std": 3.715160369873047, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 681.0, + "completions/max_terminated_length": 681.0, + "completions/mean_length": 452.671875, + "completions/mean_terminated_length": 452.671875, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "epoch": 0.8350453172205438, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0005201937165111303, + "learning_rate": 4.97387928165859e-07, + "loss": -0.0003, + "num_tokens": 234320925.0, + "reward": 8.176128387451172, + "reward_std": 0.03315550833940506, + "rewards/accuracy_reward/mean": 7.426128387451172, + "rewards/accuracy_reward/std": 0.04228882119059563, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1382 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1005.0, + "completions/max_terminated_length": 1005.0, + "completions/mean_length": 650.03125, + "completions/mean_terminated_length": 650.03125, + "completions/min_length": 460.0, + "completions/min_terminated_length": 460.0, + "epoch": 0.8356495468277946, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.033959612250328064, + "learning_rate": 4.959856252041087e-07, + "loss": 0.0025, + "num_tokens": 234528927.0, + "reward": 4.239409446716309, + "reward_std": 0.7598655819892883, + "rewards/accuracy_reward/mean": 3.4894092082977295, + "rewards/accuracy_reward/std": 3.630068302154541, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1383 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 901.0, + "completions/max_terminated_length": 901.0, + "completions/mean_length": 517.5625, + "completions/mean_terminated_length": 517.5625, + "completions/min_length": 289.0, + "completions/min_terminated_length": 289.0, + "epoch": 0.8362537764350453, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.035795629024505615, + "learning_rate": 4.945879312320422e-07, + "loss": -0.0071, + "num_tokens": 234713987.0, + "reward": 5.401435852050781, + "reward_std": 1.4261027574539185, + "rewards/accuracy_reward/mean": 4.651435852050781, + "rewards/accuracy_reward/std": 3.631568193435669, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1014.0, + "completions/max_terminated_length": 1014.0, + "completions/mean_length": 579.625, + "completions/mean_terminated_length": 579.625, + "completions/min_length": 319.0, + "completions/min_terminated_length": 319.0, + "epoch": 0.8368580060422961, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03389376774430275, + "learning_rate": 4.931948518318745e-07, + "loss": -0.0041, + "num_tokens": 234854875.0, + "reward": 5.42031717300415, + "reward_std": 1.9808275699615479, + "rewards/accuracy_reward/mean": 4.67031717300415, + "rewards/accuracy_reward/std": 3.4498538970947266, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1450.0, + "completions/max_terminated_length": 1450.0, + "completions/mean_length": 610.25, + "completions/mean_terminated_length": 610.25, + "completions/min_length": 390.0, + "completions/min_terminated_length": 390.0, + "epoch": 0.8374622356495468, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06179703399538994, + "learning_rate": 4.918063925673913e-07, + "loss": 0.0282, + "num_tokens": 235092347.0, + "reward": 3.691751480102539, + "reward_std": 2.74216890335083, + "rewards/accuracy_reward/mean": 2.941751480102539, + "rewards/accuracy_reward/std": 3.7541019916534424, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 753.0, + "completions/max_terminated_length": 753.0, + "completions/mean_length": 517.515625, + "completions/mean_terminated_length": 517.515625, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 0.8380664652567976, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03601165860891342, + "learning_rate": 4.904225589839263e-07, + "loss": 0.0047, + "num_tokens": 235268476.0, + "reward": 3.8331170082092285, + "reward_std": 1.7332665920257568, + "rewards/accuracy_reward/mean": 3.0870234966278076, + "rewards/accuracy_reward/std": 3.6123876571655273, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 1387 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 730.0, + "completions/max_terminated_length": 730.0, + "completions/mean_length": 437.359375, + "completions/mean_terminated_length": 437.359375, + "completions/min_length": 293.0, + "completions/min_terminated_length": 293.0, + "epoch": 0.8386706948640483, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0003691546153277159, + "learning_rate": 4.890433566083384e-07, + "loss": -0.0003, + "num_tokens": 235447699.0, + "reward": 6.311673164367676, + "reward_std": 0.021148502826690674, + "rewards/accuracy_reward/mean": 5.561673164367676, + "rewards/accuracy_reward/std": 3.23677134513855, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 949.0, + "completions/max_terminated_length": 949.0, + "completions/mean_length": 558.921875, + "completions/mean_terminated_length": 558.921875, + "completions/min_length": 374.0, + "completions/min_terminated_length": 374.0, + "epoch": 0.8392749244712991, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03833060339093208, + "learning_rate": 4.876687909489894e-07, + "loss": 0.0051, + "num_tokens": 235693470.0, + "reward": 4.907212734222412, + "reward_std": 1.2400517463684082, + "rewards/accuracy_reward/mean": 4.161118984222412, + "rewards/accuracy_reward/std": 3.6989359855651855, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 1389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 994.0, + "completions/max_terminated_length": 994.0, + "completions/mean_length": 578.453125, + "completions/mean_terminated_length": 578.453125, + "completions/min_length": 372.0, + "completions/min_terminated_length": 372.0, + "epoch": 0.8398791540785498, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.020027387887239456, + "learning_rate": 4.862988674957244e-07, + "loss": 0.0127, + "num_tokens": 235936027.0, + "reward": 6.2153730392456055, + "reward_std": 0.48091185092926025, + "rewards/accuracy_reward/mean": 5.465373516082764, + "rewards/accuracy_reward/std": 3.31309175491333, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 825.0, + "completions/max_terminated_length": 825.0, + "completions/mean_length": 533.765625, + "completions/mean_terminated_length": 533.765625, + "completions/min_length": 282.0, + "completions/min_terminated_length": 282.0, + "epoch": 0.8404833836858006, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03996667638421059, + "learning_rate": 4.849335917198466e-07, + "loss": 0.017, + "num_tokens": 236144700.0, + "reward": 5.3349809646606445, + "reward_std": 2.1414942741394043, + "rewards/accuracy_reward/mean": 4.5849809646606445, + "rewards/accuracy_reward/std": 3.5961263179779053, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1391 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 719.0, + "completions/max_terminated_length": 719.0, + "completions/mean_length": 481.515625, + "completions/mean_terminated_length": 481.515625, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "epoch": 0.8410876132930514, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.04300526902079582, + "learning_rate": 4.835729690740971e-07, + "loss": 0.0317, + "num_tokens": 236322989.0, + "reward": 3.884115695953369, + "reward_std": 0.8891419768333435, + "rewards/accuracy_reward/mean": 3.134115695953369, + "rewards/accuracy_reward/std": 3.698064088821411, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1995.0, + "completions/mean_length": 624.0, + "completions/mean_terminated_length": 578.0645141601562, + "completions/min_length": 357.0, + "completions/min_terminated_length": 357.0, + "epoch": 0.8416918429003021, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03920426592230797, + "learning_rate": 4.822170049926334e-07, + "loss": -0.0119, + "num_tokens": 236499677.0, + "reward": 3.5929250717163086, + "reward_std": 1.591423749923706, + "rewards/accuracy_reward/mean": 2.8663625717163086, + "rewards/accuracy_reward/std": 3.7111971378326416, + "rewards/tag_count_reward/mean": 0.7265625, + "rewards/tag_count_reward/std": 0.13152606785297394, + "step": 1393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1019.0, + "completions/max_terminated_length": 1019.0, + "completions/mean_length": 584.71875, + "completions/mean_terminated_length": 584.71875, + "completions/min_length": 379.0, + "completions/min_terminated_length": 379.0, + "epoch": 0.8422960725075529, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03530830517411232, + "learning_rate": 4.808657048910077e-07, + "loss": 0.0117, + "num_tokens": 236685835.0, + "reward": 5.4299421310424805, + "reward_std": 1.0933727025985718, + "rewards/accuracy_reward/mean": 4.6799421310424805, + "rewards/accuracy_reward/std": 3.6717963218688965, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 899.0, + "completions/max_terminated_length": 899.0, + "completions/mean_length": 572.1875, + "completions/mean_terminated_length": 572.1875, + "completions/min_length": 313.0, + "completions/min_terminated_length": 313.0, + "epoch": 0.8429003021148036, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04655952751636505, + "learning_rate": 4.795190741661442e-07, + "loss": -0.0148, + "num_tokens": 236861367.0, + "reward": 5.263673305511475, + "reward_std": 2.1303787231445312, + "rewards/accuracy_reward/mean": 4.513673305511475, + "rewards/accuracy_reward/std": 3.6272928714752197, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1053.0, + "completions/max_terminated_length": 1053.0, + "completions/mean_length": 588.1875, + "completions/mean_terminated_length": 588.1875, + "completions/min_length": 302.0, + "completions/min_terminated_length": 302.0, + "epoch": 0.8435045317220544, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.02996179834008217, + "learning_rate": 4.781771181963174e-07, + "loss": 0.0028, + "num_tokens": 237034259.0, + "reward": 3.626260995864868, + "reward_std": 1.0596314668655396, + "rewards/accuracy_reward/mean": 2.880167245864868, + "rewards/accuracy_reward/std": 3.6944491863250732, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 1396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 936.0, + "completions/max_terminated_length": 936.0, + "completions/mean_length": 596.28125, + "completions/mean_terminated_length": 596.28125, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.8441087613293051, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.039155568927526474, + "learning_rate": 4.768398423411333e-07, + "loss": 0.0024, + "num_tokens": 237179909.0, + "reward": 6.092103004455566, + "reward_std": 0.7719809412956238, + "rewards/accuracy_reward/mean": 5.353821754455566, + "rewards/accuracy_reward/std": 3.381878137588501, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 1397 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 860.0, + "completions/max_terminated_length": 860.0, + "completions/mean_length": 424.875, + "completions/mean_terminated_length": 424.875, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "epoch": 0.8447129909365559, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0037890924140810966, + "learning_rate": 4.755072519415049e-07, + "loss": -0.0002, + "num_tokens": 237328829.0, + "reward": 4.383968830108643, + "reward_std": 0.12597469985485077, + "rewards/accuracy_reward/mean": 3.6339688301086426, + "rewards/accuracy_reward/std": 3.7301101684570312, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1398 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 826.0, + "completions/max_terminated_length": 826.0, + "completions/mean_length": 588.84375, + "completions/mean_terminated_length": 588.84375, + "completions/min_length": 409.0, + "completions/min_terminated_length": 409.0, + "epoch": 0.8453172205438066, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.031241364777088165, + "learning_rate": 4.74179352319632e-07, + "loss": -0.0058, + "num_tokens": 237515907.0, + "reward": 4.340234279632568, + "reward_std": 1.5359389781951904, + "rewards/accuracy_reward/mean": 3.5902342796325684, + "rewards/accuracy_reward/std": 3.646275520324707, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 739.0, + "completions/max_terminated_length": 739.0, + "completions/mean_length": 467.140625, + "completions/mean_terminated_length": 467.140625, + "completions/min_length": 287.0, + "completions/min_terminated_length": 287.0, + "epoch": 0.8459214501510574, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.020141204819083214, + "learning_rate": 4.728561487789802e-07, + "loss": 0.0075, + "num_tokens": 237648460.0, + "reward": 4.128215789794922, + "reward_std": 0.7593331933021545, + "rewards/accuracy_reward/mean": 3.378215789794922, + "rewards/accuracy_reward/std": 3.7406563758850098, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 759.0, + "completions/max_terminated_length": 759.0, + "completions/mean_length": 485.53125, + "completions/mean_terminated_length": 485.53125, + "completions/min_length": 294.0, + "completions/min_terminated_length": 294.0, + "epoch": 0.8465256797583082, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.03546593338251114, + "learning_rate": 4.7153764660426e-07, + "loss": 0.0168, + "num_tokens": 237798142.0, + "reward": 3.5249156951904297, + "reward_std": 0.9695370197296143, + "rewards/accuracy_reward/mean": 2.7749156951904297, + "rewards/accuracy_reward/std": 3.6108174324035645, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1401 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1374.0, + "completions/mean_length": 736.0625, + "completions/mean_terminated_length": 715.2381591796875, + "completions/min_length": 385.0, + "completions/min_terminated_length": 385.0, + "epoch": 0.8471299093655589, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06353239715099335, + "learning_rate": 4.7022385106140494e-07, + "loss": 0.0537, + "num_tokens": 237949058.0, + "reward": 4.677389144897461, + "reward_std": 3.6389288902282715, + "rewards/accuracy_reward/mean": 3.943014144897461, + "rewards/accuracy_reward/std": 3.7711853981018066, + "rewards/tag_count_reward/mean": 0.734375, + "rewards/tag_count_reward/std": 0.09834947437047958, + "step": 1402 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 745.0, + "completions/max_terminated_length": 745.0, + "completions/mean_length": 419.140625, + "completions/mean_terminated_length": 419.140625, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "epoch": 0.8477341389728097, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03897814452648163, + "learning_rate": 4.689147673975502e-07, + "loss": 0.018, + "num_tokens": 238100027.0, + "reward": 5.254084587097168, + "reward_std": 2.2007951736450195, + "rewards/accuracy_reward/mean": 4.504084587097168, + "rewards/accuracy_reward/std": 3.6408262252807617, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1403 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 946.0, + "completions/max_terminated_length": 946.0, + "completions/mean_length": 694.75, + "completions/mean_terminated_length": 694.75, + "completions/min_length": 482.0, + "completions/min_terminated_length": 482.0, + "epoch": 0.8483383685800604, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0434582494199276, + "learning_rate": 4.6761040084101373e-07, + "loss": 0.0185, + "num_tokens": 238290139.0, + "reward": 5.854343414306641, + "reward_std": 2.2038872241973877, + "rewards/accuracy_reward/mean": 5.104344367980957, + "rewards/accuracy_reward/std": 3.391059160232544, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 693.0, + "completions/max_terminated_length": 693.0, + "completions/mean_length": 489.0625, + "completions/mean_terminated_length": 489.0625, + "completions/min_length": 313.0, + "completions/min_terminated_length": 313.0, + "epoch": 0.8489425981873112, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.027834320440888405, + "learning_rate": 4.6631075660127247e-07, + "loss": -0.0043, + "num_tokens": 238447279.0, + "reward": 7.249739170074463, + "reward_std": 0.9878553748130798, + "rewards/accuracy_reward/mean": 6.499739170074463, + "rewards/accuracy_reward/std": 2.4763450622558594, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1297.0, + "completions/max_terminated_length": 1297.0, + "completions/mean_length": 632.890625, + "completions/mean_terminated_length": 632.890625, + "completions/min_length": 359.0, + "completions/min_terminated_length": 359.0, + "epoch": 0.8495468277945619, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.029280083253979683, + "learning_rate": 4.650158398689436e-07, + "loss": -0.0164, + "num_tokens": 238625608.0, + "reward": 5.625653266906738, + "reward_std": 0.9668205976486206, + "rewards/accuracy_reward/mean": 4.883465766906738, + "rewards/accuracy_reward/std": 3.480330228805542, + "rewards/tag_count_reward/mean": 0.7421875, + "rewards/tag_count_reward/std": 0.0625, + "step": 1406 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1978.0, + "completions/mean_length": 539.96875, + "completions/mean_terminated_length": 516.0317993164062, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 0.8501510574018127, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.058303531259298325, + "learning_rate": 4.637256558157636e-07, + "loss": -0.0797, + "num_tokens": 238768054.0, + "reward": 4.604510307312012, + "reward_std": 1.5514850616455078, + "rewards/accuracy_reward/mean": 3.8662285804748535, + "rewards/accuracy_reward/std": 3.728281259536743, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 1407 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1740.0, + "completions/mean_length": 752.390625, + "completions/mean_terminated_length": 731.825439453125, + "completions/min_length": 502.0, + "completions/min_terminated_length": 502.0, + "epoch": 0.8507552870090634, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.02252233773469925, + "learning_rate": 4.6244020959456686e-07, + "loss": -0.0061, + "num_tokens": 238964655.0, + "reward": 4.22682523727417, + "reward_std": 0.8584127426147461, + "rewards/accuracy_reward/mean": 3.488543748855591, + "rewards/accuracy_reward/std": 3.6526284217834473, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 1408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1140.0, + "completions/max_terminated_length": 1140.0, + "completions/mean_length": 591.65625, + "completions/mean_terminated_length": 591.65625, + "completions/min_length": 401.0, + "completions/min_terminated_length": 401.0, + "epoch": 0.8513595166163141, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0020136015955358744, + "learning_rate": 4.6115950633926564e-07, + "loss": -0.0011, + "num_tokens": 239123801.0, + "reward": 2.593059539794922, + "reward_std": 0.07073704153299332, + "rewards/accuracy_reward/mean": 1.8430594205856323, + "rewards/accuracy_reward/std": 3.256304979324341, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 921.0, + "completions/max_terminated_length": 921.0, + "completions/mean_length": 519.546875, + "completions/mean_terminated_length": 519.546875, + "completions/min_length": 323.0, + "completions/min_terminated_length": 323.0, + "epoch": 0.851963746223565, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06150268018245697, + "learning_rate": 4.598835511648287e-07, + "loss": 0.0015, + "num_tokens": 239285340.0, + "reward": 5.804296970367432, + "reward_std": 2.6833224296569824, + "rewards/accuracy_reward/mean": 5.054296493530273, + "rewards/accuracy_reward/std": 3.452594518661499, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1474.0, + "completions/mean_length": 555.578125, + "completions/mean_terminated_length": 531.888916015625, + "completions/min_length": 304.0, + "completions/min_terminated_length": 304.0, + "epoch": 0.8525679758308157, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.014154798351228237, + "learning_rate": 4.586123491672626e-07, + "loss": -0.0085, + "num_tokens": 239429505.0, + "reward": 2.445054531097412, + "reward_std": 0.5858139395713806, + "rewards/accuracy_reward/mean": 1.7067734003067017, + "rewards/accuracy_reward/std": 3.1883504390716553, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 1411 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1391.0, + "completions/max_terminated_length": 1391.0, + "completions/mean_length": 619.9375, + "completions/mean_terminated_length": 619.9375, + "completions/min_length": 300.0, + "completions/min_terminated_length": 300.0, + "epoch": 0.8531722054380665, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05862529203295708, + "learning_rate": 4.573459054235896e-07, + "loss": -0.0274, + "num_tokens": 239591965.0, + "reward": 6.2617998123168945, + "reward_std": 2.5715959072113037, + "rewards/accuracy_reward/mean": 5.5117998123168945, + "rewards/accuracy_reward/std": 3.2328736782073975, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 685.0, + "completions/max_terminated_length": 685.0, + "completions/mean_length": 451.796875, + "completions/mean_terminated_length": 451.796875, + "completions/min_length": 269.0, + "completions/min_terminated_length": 269.0, + "epoch": 0.8537764350453172, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.03774956241250038, + "learning_rate": 4.560842249918279e-07, + "loss": 0.0048, + "num_tokens": 239716016.0, + "reward": 3.543534278869629, + "reward_std": 0.9720061421394348, + "rewards/accuracy_reward/mean": 2.793534517288208, + "rewards/accuracy_reward/std": 3.635004758834839, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1413 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 812.0, + "completions/max_terminated_length": 812.0, + "completions/mean_length": 546.953125, + "completions/mean_terminated_length": 546.953125, + "completions/min_length": 295.0, + "completions/min_terminated_length": 295.0, + "epoch": 0.854380664652568, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03361612185835838, + "learning_rate": 4.548273129109728e-07, + "loss": 0.009, + "num_tokens": 239887901.0, + "reward": 1.94023597240448, + "reward_std": 1.1368889808654785, + "rewards/accuracy_reward/mean": 1.19023597240448, + "rewards/accuracy_reward/std": 2.71793532371521, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1414 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 524.0, + "completions/max_terminated_length": 524.0, + "completions/mean_length": 398.515625, + "completions/mean_terminated_length": 398.515625, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.8549848942598187, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0402086079120636, + "learning_rate": 4.5357517420097427e-07, + "loss": 0.0075, + "num_tokens": 240043118.0, + "reward": 7.498831748962402, + "reward_std": 1.544876217842102, + "rewards/accuracy_reward/mean": 6.748831748962402, + "rewards/accuracy_reward/std": 2.2181153297424316, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 696.0, + "completions/max_terminated_length": 696.0, + "completions/mean_length": 513.84375, + "completions/mean_terminated_length": 513.84375, + "completions/min_length": 348.0, + "completions/min_terminated_length": 348.0, + "epoch": 0.8555891238670695, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0418655127286911, + "learning_rate": 4.523278138627179e-07, + "loss": -0.0069, + "num_tokens": 240241668.0, + "reward": 2.5962672233581543, + "reward_std": 2.0352330207824707, + "rewards/accuracy_reward/mean": 1.8462671041488647, + "rewards/accuracy_reward/std": 3.1123623847961426, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1934.0, + "completions/max_terminated_length": 1934.0, + "completions/mean_length": 679.4375, + "completions/mean_terminated_length": 679.4375, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 0.8561933534743202, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.047026827931404114, + "learning_rate": 4.5108523687800616e-07, + "loss": 0.0134, + "num_tokens": 240425840.0, + "reward": 5.535543441772461, + "reward_std": 1.5818185806274414, + "rewards/accuracy_reward/mean": 4.785543441772461, + "rewards/accuracy_reward/std": 3.558215618133545, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1417 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1570.0, + "completions/mean_length": 798.140625, + "completions/mean_terminated_length": 668.8448486328125, + "completions/min_length": 367.0, + "completions/min_terminated_length": 367.0, + "epoch": 0.856797583081571, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.045719511806964874, + "learning_rate": 4.498474482095365e-07, + "loss": -0.0446, + "num_tokens": 240587321.0, + "reward": 1.6690969467163086, + "reward_std": 2.220553398132324, + "rewards/accuracy_reward/mean": 0.9894093871116638, + "rewards/accuracy_reward/std": 2.4906392097473145, + "rewards/tag_count_reward/mean": 0.6796875, + "rewards/tag_count_reward/std": 0.2203386276960373, + "step": 1418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 933.0, + "completions/max_terminated_length": 933.0, + "completions/mean_length": 529.390625, + "completions/mean_terminated_length": 529.390625, + "completions/min_length": 279.0, + "completions/min_terminated_length": 279.0, + "epoch": 0.8574018126888218, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04033606871962547, + "learning_rate": 4.4861445280088246e-07, + "loss": -0.008, + "num_tokens": 240714914.0, + "reward": 3.079773426055908, + "reward_std": 1.9860963821411133, + "rewards/accuracy_reward/mean": 2.329773426055908, + "rewards/accuracy_reward/std": 3.475165367126465, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1419 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 709.0, + "completions/max_terminated_length": 709.0, + "completions/mean_length": 413.046875, + "completions/mean_terminated_length": 413.046875, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "epoch": 0.8580060422960725, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.035326939076185226, + "learning_rate": 4.473862555764745e-07, + "loss": -0.0211, + "num_tokens": 240873509.0, + "reward": 3.523961067199707, + "reward_std": 0.9683634638786316, + "rewards/accuracy_reward/mean": 2.773961067199707, + "rewards/accuracy_reward/std": 3.609565019607544, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 746.0, + "completions/max_terminated_length": 746.0, + "completions/mean_length": 527.1875, + "completions/mean_terminated_length": 527.1875, + "completions/min_length": 341.0, + "completions/min_terminated_length": 341.0, + "epoch": 0.8586102719033233, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0511360764503479, + "learning_rate": 4.461628614415793e-07, + "loss": 0.0137, + "num_tokens": 241000113.0, + "reward": 4.5797343254089355, + "reward_std": 2.3907010555267334, + "rewards/accuracy_reward/mean": 3.8336408138275146, + "rewards/accuracy_reward/std": 3.7249279022216797, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 1421 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1906.0, + "completions/max_terminated_length": 1906.0, + "completions/mean_length": 576.5625, + "completions/mean_terminated_length": 576.5625, + "completions/min_length": 343.0, + "completions/min_terminated_length": 343.0, + "epoch": 0.859214501510574, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0019897017627954483, + "learning_rate": 4.4494427528228083e-07, + "loss": -0.0002, + "num_tokens": 241144133.0, + "reward": 6.353646278381348, + "reward_std": 0.07430576533079147, + "rewards/accuracy_reward/mean": 5.603646278381348, + "rewards/accuracy_reward/std": 3.194432497024536, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1422 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1351.0, + "completions/max_terminated_length": 1351.0, + "completions/mean_length": 708.1875, + "completions/mean_terminated_length": 708.1875, + "completions/min_length": 299.0, + "completions/min_terminated_length": 299.0, + "epoch": 0.8598187311178248, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.005506359972059727, + "learning_rate": 4.4373050196545983e-07, + "loss": -0.0061, + "num_tokens": 241285233.0, + "reward": 2.661262273788452, + "reward_std": 0.21086767315864563, + "rewards/accuracy_reward/mean": 1.9229812622070312, + "rewards/accuracy_reward/std": 3.2116639614105225, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 1423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 740.0, + "completions/max_terminated_length": 740.0, + "completions/mean_length": 516.15625, + "completions/mean_terminated_length": 516.15625, + "completions/min_length": 326.0, + "completions/min_terminated_length": 326.0, + "epoch": 0.8604229607250755, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0003106700023636222, + "learning_rate": 4.425215463387764e-07, + "loss": -0.0001, + "num_tokens": 241499723.0, + "reward": 4.470515251159668, + "reward_std": 0.01433244813233614, + "rewards/accuracy_reward/mean": 3.720515251159668, + "rewards/accuracy_reward/std": 3.7500176429748535, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1177.0, + "completions/max_terminated_length": 1177.0, + "completions/mean_length": 644.484375, + "completions/mean_terminated_length": 644.484375, + "completions/min_length": 351.0, + "completions/min_terminated_length": 351.0, + "epoch": 0.8610271903323263, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.013715093024075031, + "learning_rate": 4.4131741323064863e-07, + "loss": -0.0003, + "num_tokens": 241674954.0, + "reward": 0.825348436832428, + "reward_std": 0.5557969808578491, + "rewards/accuracy_reward/mean": 0.07534843683242798, + "rewards/accuracy_reward/std": 0.9566243886947632, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1072.0, + "completions/max_terminated_length": 1072.0, + "completions/mean_length": 599.84375, + "completions/mean_terminated_length": 599.84375, + "completions/min_length": 271.0, + "completions/min_terminated_length": 271.0, + "epoch": 0.861631419939577, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.039317190647125244, + "learning_rate": 4.4011810745023365e-07, + "loss": 0.0313, + "num_tokens": 241899568.0, + "reward": 5.734759330749512, + "reward_std": 1.442224144935608, + "rewards/accuracy_reward/mean": 4.996478080749512, + "rewards/accuracy_reward/std": 3.538800001144409, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 1426 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 750.0, + "completions/max_terminated_length": 750.0, + "completions/mean_length": 498.734375, + "completions/mean_terminated_length": 498.734375, + "completions/min_length": 280.0, + "completions/min_terminated_length": 280.0, + "epoch": 0.8622356495468277, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06316912919282913, + "learning_rate": 4.3892363378741015e-07, + "loss": 0.0046, + "num_tokens": 242075663.0, + "reward": 6.510375022888184, + "reward_std": 2.709207534790039, + "rewards/accuracy_reward/mean": 5.760375022888184, + "rewards/accuracy_reward/std": 3.2317235469818115, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1427 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 785.0, + "completions/mean_length": 562.1875, + "completions/mean_terminated_length": 538.6032104492188, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 0.8628398791540786, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.02995225414633751, + "learning_rate": 4.377339970127567e-07, + "loss": 0.0281, + "num_tokens": 242209099.0, + "reward": 3.1478281021118164, + "reward_std": 0.9274076223373413, + "rewards/accuracy_reward/mean": 2.4095468521118164, + "rewards/accuracy_reward/std": 3.5466601848602295, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 1428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 970.0, + "completions/mean_length": 670.28125, + "completions/mean_terminated_length": 625.8386840820312, + "completions/min_length": 280.0, + "completions/min_terminated_length": 280.0, + "epoch": 0.8634441087613293, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.041942108422517776, + "learning_rate": 4.365492018775346e-07, + "loss": -0.0785, + "num_tokens": 242374141.0, + "reward": 7.203215599060059, + "reward_std": 1.0208157300949097, + "rewards/accuracy_reward/mean": 6.476653099060059, + "rewards/accuracy_reward/std": 2.5094199180603027, + "rewards/tag_count_reward/mean": 0.7265625, + "rewards/tag_count_reward/std": 0.13152606785297394, + "step": 1429 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 836.0, + "completions/max_terminated_length": 836.0, + "completions/mean_length": 536.15625, + "completions/mean_terminated_length": 536.15625, + "completions/min_length": 374.0, + "completions/min_terminated_length": 374.0, + "epoch": 0.8640483383685801, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.046198584139347076, + "learning_rate": 4.353692531136677e-07, + "loss": 0.0026, + "num_tokens": 242554135.0, + "reward": 7.151562690734863, + "reward_std": 1.4423164129257202, + "rewards/accuracy_reward/mean": 6.401562690734863, + "rewards/accuracy_reward/std": 2.6102077960968018, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 871.0, + "completions/max_terminated_length": 871.0, + "completions/mean_length": 581.515625, + "completions/mean_terminated_length": 581.515625, + "completions/min_length": 381.0, + "completions/min_terminated_length": 381.0, + "epoch": 0.8646525679758308, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.02069784142076969, + "learning_rate": 4.341941554337248e-07, + "loss": -0.0063, + "num_tokens": 242710872.0, + "reward": 6.066758155822754, + "reward_std": 0.6885491609573364, + "rewards/accuracy_reward/mean": 5.316758155822754, + "rewards/accuracy_reward/std": 3.389599084854126, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1431 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 942.0, + "completions/max_terminated_length": 942.0, + "completions/mean_length": 576.109375, + "completions/mean_terminated_length": 576.109375, + "completions/min_length": 370.0, + "completions/min_terminated_length": 370.0, + "epoch": 0.8652567975830816, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06616289168596268, + "learning_rate": 4.330239135308996e-07, + "loss": 0.0, + "num_tokens": 242900879.0, + "reward": 5.846131324768066, + "reward_std": 1.8296680450439453, + "rewards/accuracy_reward/mean": 5.096131324768066, + "rewards/accuracy_reward/std": 3.535043954849243, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 908.0, + "completions/max_terminated_length": 908.0, + "completions/mean_length": 644.265625, + "completions/mean_terminated_length": 644.265625, + "completions/min_length": 429.0, + "completions/min_terminated_length": 429.0, + "epoch": 0.8658610271903323, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03713757544755936, + "learning_rate": 4.31858532078992e-07, + "loss": 0.007, + "num_tokens": 243073328.0, + "reward": 3.3720877170562744, + "reward_std": 1.435239553451538, + "rewards/accuracy_reward/mean": 2.6220874786376953, + "rewards/accuracy_reward/std": 3.44815993309021, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1433 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 936.0, + "completions/max_terminated_length": 936.0, + "completions/mean_length": 578.390625, + "completions/mean_terminated_length": 578.390625, + "completions/min_length": 317.0, + "completions/min_terminated_length": 317.0, + "epoch": 0.8664652567975831, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04699649289250374, + "learning_rate": 4.3069801573239134e-07, + "loss": 0.008, + "num_tokens": 243310569.0, + "reward": 3.6059060096740723, + "reward_std": 2.689938545227051, + "rewards/accuracy_reward/mean": 2.8559062480926514, + "rewards/accuracy_reward/std": 3.663954734802246, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1752.0, + "completions/mean_length": 672.6875, + "completions/mean_terminated_length": 650.857177734375, + "completions/min_length": 326.0, + "completions/min_terminated_length": 326.0, + "epoch": 0.8670694864048338, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.031757205724716187, + "learning_rate": 4.295423691260548e-07, + "loss": -0.0195, + "num_tokens": 243472293.0, + "reward": 3.536357879638672, + "reward_std": 1.671194076538086, + "rewards/accuracy_reward/mean": 2.798076629638672, + "rewards/accuracy_reward/std": 3.5568602085113525, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 1435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 896.0, + "completions/max_terminated_length": 896.0, + "completions/mean_length": 548.34375, + "completions/mean_terminated_length": 548.34375, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "epoch": 0.8676737160120845, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.00043923858902417123, + "learning_rate": 4.28391596875491e-07, + "loss": -0.0003, + "num_tokens": 243627835.0, + "reward": 8.13845157623291, + "reward_std": 0.02803395316004753, + "rewards/accuracy_reward/mean": 7.38845157623291, + "rewards/accuracy_reward/std": 0.06958401948213577, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 764.0, + "completions/max_terminated_length": 764.0, + "completions/mean_length": 506.484375, + "completions/mean_terminated_length": 506.484375, + "completions/min_length": 278.0, + "completions/min_terminated_length": 278.0, + "epoch": 0.8682779456193354, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04001127555966377, + "learning_rate": 4.2724570357674076e-07, + "loss": -0.0196, + "num_tokens": 243771642.0, + "reward": 5.90208625793457, + "reward_std": 2.5299088954925537, + "rewards/accuracy_reward/mean": 5.15208625793457, + "rewards/accuracy_reward/std": 3.7115044593811035, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 921.0, + "completions/max_terminated_length": 921.0, + "completions/mean_length": 600.28125, + "completions/mean_terminated_length": 600.28125, + "completions/min_length": 338.0, + "completions/min_terminated_length": 338.0, + "epoch": 0.8688821752265861, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03886450082063675, + "learning_rate": 4.261046938063597e-07, + "loss": 0.0228, + "num_tokens": 243938188.0, + "reward": 4.722962379455566, + "reward_std": 1.466734766960144, + "rewards/accuracy_reward/mean": 3.9729626178741455, + "rewards/accuracy_reward/std": 3.749521255493164, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 989.0, + "completions/max_terminated_length": 989.0, + "completions/mean_length": 617.84375, + "completions/mean_terminated_length": 617.84375, + "completions/min_length": 387.0, + "completions/min_terminated_length": 387.0, + "epoch": 0.8694864048338369, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0388813242316246, + "learning_rate": 4.2496857212139875e-07, + "loss": 0.0011, + "num_tokens": 244087538.0, + "reward": 3.6759531497955322, + "reward_std": 1.368376612663269, + "rewards/accuracy_reward/mean": 2.9259531497955322, + "rewards/accuracy_reward/std": 3.667834758758545, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1439 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1237.0, + "completions/mean_length": 666.015625, + "completions/mean_terminated_length": 573.8833618164062, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "epoch": 0.8700906344410876, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.006492708344012499, + "learning_rate": 4.238373430593857e-07, + "loss": -0.0222, + "num_tokens": 244239299.0, + "reward": 2.435028076171875, + "reward_std": 0.3062900900840759, + "rewards/accuracy_reward/mean": 1.731903076171875, + "rewards/accuracy_reward/std": 3.292546272277832, + "rewards/tag_count_reward/mean": 0.703125, + "rewards/tag_count_reward/std": 0.18298126757144928, + "step": 1440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1161.0, + "completions/max_terminated_length": 1161.0, + "completions/mean_length": 509.578125, + "completions/mean_terminated_length": 509.578125, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "epoch": 0.8706948640483384, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.01782440021634102, + "learning_rate": 4.227110111383094e-07, + "loss": -0.0077, + "num_tokens": 244400712.0, + "reward": 6.1016035079956055, + "reward_std": 0.6348812580108643, + "rewards/accuracy_reward/mean": 5.351603031158447, + "rewards/accuracy_reward/std": 3.3590800762176514, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1441 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 843.0, + "completions/mean_length": 615.875, + "completions/mean_terminated_length": 593.1428833007812, + "completions/min_length": 420.0, + "completions/min_terminated_length": 420.0, + "epoch": 0.8712990936555891, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.028641698881983757, + "learning_rate": 4.2158958085659867e-07, + "loss": -0.007, + "num_tokens": 244557344.0, + "reward": 2.728257894515991, + "reward_std": 1.7241895198822021, + "rewards/accuracy_reward/mean": 2.001695394515991, + "rewards/accuracy_reward/std": 3.447481393814087, + "rewards/tag_count_reward/mean": 0.7265625, + "rewards/tag_count_reward/std": 0.13152606785297394, + "step": 1442 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 789.0, + "completions/max_terminated_length": 789.0, + "completions/mean_length": 519.984375, + "completions/mean_terminated_length": 519.984375, + "completions/min_length": 358.0, + "completions/min_terminated_length": 358.0, + "epoch": 0.8719033232628399, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03370973467826843, + "learning_rate": 4.2047305669310644e-07, + "loss": -0.0005, + "num_tokens": 244696687.0, + "reward": 6.276340484619141, + "reward_std": 2.106743812561035, + "rewards/accuracy_reward/mean": 5.526340484619141, + "rewards/accuracy_reward/std": 3.2023935317993164, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1443 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1067.0, + "completions/max_terminated_length": 1067.0, + "completions/mean_length": 598.1875, + "completions/mean_terminated_length": 598.1875, + "completions/min_length": 293.0, + "completions/min_terminated_length": 293.0, + "epoch": 0.8725075528700906, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05944611877202988, + "learning_rate": 4.1936144310709145e-07, + "loss": -0.0441, + "num_tokens": 244999259.0, + "reward": 2.7253000736236572, + "reward_std": 2.3969693183898926, + "rewards/accuracy_reward/mean": 1.9752999544143677, + "rewards/accuracy_reward/std": 3.1896026134490967, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 873.0, + "completions/max_terminated_length": 873.0, + "completions/mean_length": 586.15625, + "completions/mean_terminated_length": 586.15625, + "completions/min_length": 409.0, + "completions/min_terminated_length": 409.0, + "epoch": 0.8731117824773413, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04170983284711838, + "learning_rate": 4.182547445381998e-07, + "loss": -0.0002, + "num_tokens": 245174341.0, + "reward": 3.863502025604248, + "reward_std": 1.9112087488174438, + "rewards/accuracy_reward/mean": 3.113502025604248, + "rewards/accuracy_reward/std": 3.7414090633392334, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 757.0, + "completions/max_terminated_length": 757.0, + "completions/mean_length": 526.40625, + "completions/mean_terminated_length": 526.40625, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "epoch": 0.8737160120845922, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04491425305604935, + "learning_rate": 4.171529654064475e-07, + "loss": 0.0042, + "num_tokens": 245306895.0, + "reward": 5.306637287139893, + "reward_std": 1.6991677284240723, + "rewards/accuracy_reward/mean": 4.556637763977051, + "rewards/accuracy_reward/std": 3.6359171867370605, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 702.0, + "completions/max_terminated_length": 702.0, + "completions/mean_length": 431.78125, + "completions/mean_terminated_length": 431.78125, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 0.8743202416918429, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.02443856932222843, + "learning_rate": 4.1605611011220334e-07, + "loss": -0.006, + "num_tokens": 245533585.0, + "reward": 5.957815647125244, + "reward_std": 1.150640606880188, + "rewards/accuracy_reward/mean": 5.207815647125244, + "rewards/accuracy_reward/std": 3.4475831985473633, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1034.0, + "completions/max_terminated_length": 1034.0, + "completions/mean_length": 557.875, + "completions/mean_terminated_length": 557.875, + "completions/min_length": 324.0, + "completions/min_terminated_length": 324.0, + "epoch": 0.8749244712990937, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03461795672774315, + "learning_rate": 4.1496418303617104e-07, + "loss": -0.0037, + "num_tokens": 245670457.0, + "reward": 5.627078056335449, + "reward_std": 1.8408150672912598, + "rewards/accuracy_reward/mean": 4.877078056335449, + "rewards/accuracy_reward/std": 3.551025629043579, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 758.0, + "completions/max_terminated_length": 758.0, + "completions/mean_length": 529.28125, + "completions/mean_terminated_length": 529.28125, + "completions/min_length": 357.0, + "completions/min_terminated_length": 357.0, + "epoch": 0.8755287009063444, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.024872832000255585, + "learning_rate": 4.138771885393712e-07, + "loss": -0.0044, + "num_tokens": 245775995.0, + "reward": 3.072573184967041, + "reward_std": 0.8426039218902588, + "rewards/accuracy_reward/mean": 2.32257342338562, + "rewards/accuracy_reward/std": 3.4722161293029785, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1449 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 713.0, + "completions/max_terminated_length": 713.0, + "completions/mean_length": 464.4375, + "completions/mean_terminated_length": 464.4375, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.8761329305135952, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0475834384560585, + "learning_rate": 4.127951309631239e-07, + "loss": 0.0325, + "num_tokens": 245940439.0, + "reward": 5.798478603363037, + "reward_std": 1.338912010192871, + "rewards/accuracy_reward/mean": 5.048478603363037, + "rewards/accuracy_reward/std": 3.5387234687805176, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1579.0, + "completions/max_terminated_length": 1579.0, + "completions/mean_length": 545.875, + "completions/mean_terminated_length": 545.875, + "completions/min_length": 274.0, + "completions/min_terminated_length": 274.0, + "epoch": 0.8767371601208459, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.043091244995594025, + "learning_rate": 4.117180146290332e-07, + "loss": 0.0475, + "num_tokens": 246069583.0, + "reward": 5.663455009460449, + "reward_std": 1.420501947402954, + "rewards/accuracy_reward/mean": 4.913454532623291, + "rewards/accuracy_reward/std": 3.50992751121521, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1451 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 962.0, + "completions/max_terminated_length": 962.0, + "completions/mean_length": 520.96875, + "completions/mean_terminated_length": 520.96875, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "epoch": 0.8773413897280967, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06046295166015625, + "learning_rate": 4.1064584383896707e-07, + "loss": -0.0093, + "num_tokens": 246201453.0, + "reward": 4.927865505218506, + "reward_std": 2.536203145980835, + "rewards/accuracy_reward/mean": 4.177865982055664, + "rewards/accuracy_reward/std": 3.6884849071502686, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 787.0, + "completions/mean_length": 559.046875, + "completions/mean_terminated_length": 535.4127197265625, + "completions/min_length": 319.0, + "completions/min_terminated_length": 319.0, + "epoch": 0.8779456193353474, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.03036593273282051, + "learning_rate": 4.0957862287504207e-07, + "loss": -0.0142, + "num_tokens": 246343872.0, + "reward": 1.2994797229766846, + "reward_std": 1.0570956468582153, + "rewards/accuracy_reward/mean": 0.5729171633720398, + "rewards/accuracy_reward/std": 2.0478475093841553, + "rewards/tag_count_reward/mean": 0.7265625, + "rewards/tag_count_reward/std": 0.13152606785297394, + "step": 1453 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 970.0, + "completions/mean_length": 583.828125, + "completions/mean_terminated_length": 560.5873413085938, + "completions/min_length": 376.0, + "completions/min_terminated_length": 376.0, + "epoch": 0.8785498489425981, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.024362489581108093, + "learning_rate": 4.085163559996061e-07, + "loss": -0.0107, + "num_tokens": 246533653.0, + "reward": 4.094364166259766, + "reward_std": 0.8698124289512634, + "rewards/accuracy_reward/mean": 3.3560829162597656, + "rewards/accuracy_reward/std": 3.7498867511749268, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 1454 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 800.0, + "completions/max_terminated_length": 800.0, + "completions/mean_length": 501.015625, + "completions/mean_terminated_length": 501.015625, + "completions/min_length": 326.0, + "completions/min_terminated_length": 326.0, + "epoch": 0.879154078549849, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.03206535056233406, + "learning_rate": 4.074590474552207e-07, + "loss": 0.0008, + "num_tokens": 246703206.0, + "reward": 4.020310401916504, + "reward_std": 1.203214168548584, + "rewards/accuracy_reward/mean": 3.270310878753662, + "rewards/accuracy_reward/std": 3.7234432697296143, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 742.0, + "completions/max_terminated_length": 742.0, + "completions/mean_length": 482.875, + "completions/mean_terminated_length": 482.875, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "epoch": 0.8797583081570997, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.028743600472807884, + "learning_rate": 4.064067014646441e-07, + "loss": 0.0048, + "num_tokens": 246849854.0, + "reward": 2.7783827781677246, + "reward_std": 1.6241803169250488, + "rewards/accuracy_reward/mean": 2.0283827781677246, + "rewards/accuracy_reward/std": 3.3429410457611084, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1047.0, + "completions/max_terminated_length": 1047.0, + "completions/mean_length": 519.84375, + "completions/mean_terminated_length": 519.84375, + "completions/min_length": 305.0, + "completions/min_terminated_length": 305.0, + "epoch": 0.8803625377643505, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.034689050167798996, + "learning_rate": 4.053593222308155e-07, + "loss": 0.0125, + "num_tokens": 247008724.0, + "reward": 3.4793407917022705, + "reward_std": 1.5219560861587524, + "rewards/accuracy_reward/mean": 2.7293405532836914, + "rewards/accuracy_reward/std": 3.510695457458496, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1457 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1482.0, + "completions/mean_length": 650.90625, + "completions/mean_terminated_length": 582.1967163085938, + "completions/min_length": 310.0, + "completions/min_terminated_length": 310.0, + "epoch": 0.8809667673716012, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.028909485787153244, + "learning_rate": 4.043169139368373e-07, + "loss": -0.0283, + "num_tokens": 247192302.0, + "reward": 2.7335939407348633, + "reward_std": 0.9350056052207947, + "rewards/accuracy_reward/mean": 2.0187501907348633, + "rewards/accuracy_reward/std": 3.448380947113037, + "rewards/tag_count_reward/mean": 0.71484375, + "rewards/tag_count_reward/std": 0.1597815304994583, + "step": 1458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1297.0, + "completions/max_terminated_length": 1297.0, + "completions/mean_length": 532.25, + "completions/mean_terminated_length": 532.25, + "completions/min_length": 319.0, + "completions/min_terminated_length": 319.0, + "epoch": 0.881570996978852, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.01386852003633976, + "learning_rate": 4.0327948074595816e-07, + "loss": -0.0009, + "num_tokens": 247360958.0, + "reward": 0.895007848739624, + "reward_std": 0.6666655540466309, + "rewards/accuracy_reward/mean": 0.14500781893730164, + "rewards/accuracy_reward/std": 0.9709972739219666, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1459 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1184.0, + "completions/mean_length": 656.3125, + "completions/mean_terminated_length": 634.2222290039062, + "completions/min_length": 341.0, + "completions/min_terminated_length": 341.0, + "epoch": 0.8821752265861027, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.037786055356264114, + "learning_rate": 4.022470268015564e-07, + "loss": 0.0026, + "num_tokens": 247500434.0, + "reward": 5.808547019958496, + "reward_std": 1.027167558670044, + "rewards/accuracy_reward/mean": 5.070265769958496, + "rewards/accuracy_reward/std": 3.4700751304626465, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 1460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1598.0, + "completions/max_terminated_length": 1598.0, + "completions/mean_length": 699.109375, + "completions/mean_terminated_length": 699.109375, + "completions/min_length": 344.0, + "completions/min_terminated_length": 344.0, + "epoch": 0.8827794561933535, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04509512707591057, + "learning_rate": 4.0121955622712566e-07, + "loss": 0.0003, + "num_tokens": 247647033.0, + "reward": 3.703559398651123, + "reward_std": 2.309300661087036, + "rewards/accuracy_reward/mean": 2.953559398651123, + "rewards/accuracy_reward/std": 3.734823703765869, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1461 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 846.0, + "completions/max_terminated_length": 846.0, + "completions/mean_length": 584.328125, + "completions/mean_terminated_length": 584.328125, + "completions/min_length": 314.0, + "completions/min_terminated_length": 314.0, + "epoch": 0.8833836858006042, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04949863627552986, + "learning_rate": 4.001970731262549e-07, + "loss": 0.0089, + "num_tokens": 247835934.0, + "reward": 2.551907777786255, + "reward_std": 1.7839586734771729, + "rewards/accuracy_reward/mean": 1.8019077777862549, + "rewards/accuracy_reward/std": 3.3000130653381348, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1462 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 850.0, + "completions/max_terminated_length": 850.0, + "completions/mean_length": 536.796875, + "completions/mean_terminated_length": 536.796875, + "completions/min_length": 364.0, + "completions/min_terminated_length": 364.0, + "epoch": 0.8839879154078549, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04000964015722275, + "learning_rate": 3.991795815826143e-07, + "loss": -0.0113, + "num_tokens": 248037617.0, + "reward": 1.5325984954833984, + "reward_std": 1.610664963722229, + "rewards/accuracy_reward/mean": 0.7825984954833984, + "rewards/accuracy_reward/std": 2.358302116394043, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1463 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1435.0, + "completions/max_terminated_length": 1435.0, + "completions/mean_length": 485.40625, + "completions/mean_terminated_length": 485.40625, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 0.8845921450151057, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.026351606473326683, + "learning_rate": 3.9816708565993797e-07, + "loss": 0.0215, + "num_tokens": 248168347.0, + "reward": 4.927779674530029, + "reward_std": 1.2185767889022827, + "rewards/accuracy_reward/mean": 4.177779674530029, + "rewards/accuracy_reward/std": 3.653137683868408, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 843.0, + "completions/max_terminated_length": 843.0, + "completions/mean_length": 558.171875, + "completions/mean_terminated_length": 558.171875, + "completions/min_length": 296.0, + "completions/min_terminated_length": 296.0, + "epoch": 0.8851963746223565, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03922109678387642, + "learning_rate": 3.971595894020092e-07, + "loss": -0.001, + "num_tokens": 248424758.0, + "reward": 5.754775047302246, + "reward_std": 0.9037761092185974, + "rewards/accuracy_reward/mean": 5.004775047302246, + "rewards/accuracy_reward/std": 3.525344133377075, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1345.0, + "completions/max_terminated_length": 1345.0, + "completions/mean_length": 668.234375, + "completions/mean_terminated_length": 668.234375, + "completions/min_length": 393.0, + "completions/min_terminated_length": 393.0, + "epoch": 0.8858006042296073, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.026290321722626686, + "learning_rate": 3.9615709683264225e-07, + "loss": 0.007, + "num_tokens": 248600741.0, + "reward": 3.0942718982696533, + "reward_std": 0.878045916557312, + "rewards/accuracy_reward/mean": 2.3442718982696533, + "rewards/accuracy_reward/std": 3.4747979640960693, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1466 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1357.0, + "completions/mean_length": 737.78125, + "completions/mean_terminated_length": 716.9841918945312, + "completions/min_length": 470.0, + "completions/min_terminated_length": 470.0, + "epoch": 0.886404833836858, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.026037320494651794, + "learning_rate": 3.9515961195566716e-07, + "loss": -0.0182, + "num_tokens": 248735191.0, + "reward": 4.403951644897461, + "reward_std": 1.1078529357910156, + "rewards/accuracy_reward/mean": 3.665670156478882, + "rewards/accuracy_reward/std": 3.827960729598999, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 1467 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1252.0, + "completions/max_terminated_length": 1252.0, + "completions/mean_length": 568.203125, + "completions/mean_terminated_length": 568.203125, + "completions/min_length": 313.0, + "completions/min_terminated_length": 313.0, + "epoch": 0.8870090634441088, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.030393218621611595, + "learning_rate": 3.941671387549152e-07, + "loss": -0.0163, + "num_tokens": 248960788.0, + "reward": 0.9310327768325806, + "reward_std": 1.075052261352539, + "rewards/accuracy_reward/mean": 0.18493905663490295, + "rewards/accuracy_reward/std": 1.326626181602478, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 1468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1821.0, + "completions/max_terminated_length": 1821.0, + "completions/mean_length": 635.015625, + "completions/mean_terminated_length": 635.015625, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "epoch": 0.8876132930513595, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0533234104514122, + "learning_rate": 3.9317968119420013e-07, + "loss": -0.0222, + "num_tokens": 249139253.0, + "reward": 4.74453592300415, + "reward_std": 1.2048609256744385, + "rewards/accuracy_reward/mean": 4.03359842300415, + "rewards/accuracy_reward/std": 3.996333360671997, + "rewards/tag_count_reward/mean": 0.7109375, + "rewards/tag_count_reward/std": 0.1423913538455963, + "step": 1469 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1116.0, + "completions/max_terminated_length": 1116.0, + "completions/mean_length": 610.984375, + "completions/mean_terminated_length": 610.984375, + "completions/min_length": 286.0, + "completions/min_terminated_length": 286.0, + "epoch": 0.8882175226586103, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03581336513161659, + "learning_rate": 3.9219724321730433e-07, + "loss": 0.017, + "num_tokens": 249306644.0, + "reward": 2.3702280521392822, + "reward_std": 1.8335082530975342, + "rewards/accuracy_reward/mean": 1.6202282905578613, + "rewards/accuracy_reward/std": 2.97110915184021, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1448.0, + "completions/max_terminated_length": 1448.0, + "completions/mean_length": 841.5, + "completions/mean_terminated_length": 841.5, + "completions/min_length": 484.0, + "completions/min_terminated_length": 484.0, + "epoch": 0.888821752265861, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.031902629882097244, + "learning_rate": 3.912198287479631e-07, + "loss": -0.0044, + "num_tokens": 249514468.0, + "reward": 1.5003468990325928, + "reward_std": 1.0186758041381836, + "rewards/accuracy_reward/mean": 0.750346839427948, + "rewards/accuracy_reward/std": 2.1783864498138428, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1471 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1607.0, + "completions/mean_length": 837.59375, + "completions/mean_terminated_length": 798.54833984375, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 0.8894259818731118, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.007695023436099291, + "learning_rate": 3.902474416898481e-07, + "loss": -0.0155, + "num_tokens": 249676410.0, + "reward": 2.5195701122283936, + "reward_std": 0.3119346797466278, + "rewards/accuracy_reward/mean": 1.7851954698562622, + "rewards/accuracy_reward/std": 3.280883550643921, + "rewards/tag_count_reward/mean": 0.734375, + "rewards/tag_count_reward/std": 0.09834947437047958, + "step": 1472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1002.0, + "completions/max_terminated_length": 1002.0, + "completions/mean_length": 512.421875, + "completions/mean_terminated_length": 512.421875, + "completions/min_length": 290.0, + "completions/min_terminated_length": 290.0, + "epoch": 0.8900302114803625, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.05922657251358032, + "learning_rate": 3.8928008592655165e-07, + "loss": 0.082, + "num_tokens": 249833813.0, + "reward": 4.867877960205078, + "reward_std": 2.2673168182373047, + "rewards/accuracy_reward/mean": 4.117878437042236, + "rewards/accuracy_reward/std": 3.761465072631836, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1473 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 897.0, + "completions/mean_length": 550.90625, + "completions/mean_terminated_length": 527.1428833007812, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, + "epoch": 0.8906344410876132, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03376559540629387, + "learning_rate": 3.8831776532157253e-07, + "loss": -0.0142, + "num_tokens": 250065327.0, + "reward": 5.774246692657471, + "reward_std": 1.430811882019043, + "rewards/accuracy_reward/mean": 5.035965442657471, + "rewards/accuracy_reward/std": 3.4946839809417725, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 1474 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 942.0, + "completions/max_terminated_length": 942.0, + "completions/mean_length": 632.3125, + "completions/mean_terminated_length": 632.3125, + "completions/min_length": 350.0, + "completions/min_terminated_length": 350.0, + "epoch": 0.8912386706948641, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.028416089713573456, + "learning_rate": 3.873604837182997e-07, + "loss": 0.007, + "num_tokens": 250263443.0, + "reward": 4.167445182800293, + "reward_std": 1.0718423128128052, + "rewards/accuracy_reward/mean": 3.417445182800293, + "rewards/accuracy_reward/std": 3.7677085399627686, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 760.0, + "completions/max_terminated_length": 760.0, + "completions/mean_length": 510.90625, + "completions/mean_terminated_length": 510.90625, + "completions/min_length": 382.0, + "completions/min_terminated_length": 382.0, + "epoch": 0.8918429003021148, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03813767060637474, + "learning_rate": 3.864082449399963e-07, + "loss": 0.0128, + "num_tokens": 250408557.0, + "reward": 7.625864505767822, + "reward_std": 1.7135270833969116, + "rewards/accuracy_reward/mean": 6.875864505767822, + "rewards/accuracy_reward/std": 1.9915086030960083, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1047.0, + "completions/max_terminated_length": 1047.0, + "completions/mean_length": 604.515625, + "completions/mean_terminated_length": 604.515625, + "completions/min_length": 327.0, + "completions/min_terminated_length": 327.0, + "epoch": 0.8924471299093656, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.035430263727903366, + "learning_rate": 3.854610527897852e-07, + "loss": 0.0173, + "num_tokens": 250557838.0, + "reward": 5.089111328125, + "reward_std": 1.7899115085601807, + "rewards/accuracy_reward/mean": 4.339111328125, + "rewards/accuracy_reward/std": 3.5430808067321777, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1477 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 646.0, + "completions/max_terminated_length": 646.0, + "completions/mean_length": 432.28125, + "completions/mean_terminated_length": 432.28125, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "epoch": 0.8930513595166163, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03029782511293888, + "learning_rate": 3.8451891105063417e-07, + "loss": 0.0093, + "num_tokens": 250726208.0, + "reward": 4.674130916595459, + "reward_std": 1.2757353782653809, + "rewards/accuracy_reward/mean": 3.924130916595459, + "rewards/accuracy_reward/std": 3.748934030532837, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1478 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 841.0, + "completions/max_terminated_length": 841.0, + "completions/mean_length": 509.28125, + "completions/mean_terminated_length": 509.28125, + "completions/min_length": 333.0, + "completions/min_terminated_length": 333.0, + "epoch": 0.8936555891238671, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0446617491543293, + "learning_rate": 3.835818234853401e-07, + "loss": 0.0086, + "num_tokens": 250915314.0, + "reward": 2.838151454925537, + "reward_std": 1.875701904296875, + "rewards/accuracy_reward/mean": 2.088151454925537, + "rewards/accuracy_reward/std": 3.349247694015503, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1479 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1036.0, + "completions/mean_length": 533.875, + "completions/mean_terminated_length": 485.0322570800781, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "epoch": 0.8942598187311178, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03983471915125847, + "learning_rate": 3.8264979383651364e-07, + "loss": 0.0322, + "num_tokens": 251061546.0, + "reward": 2.091463088989258, + "reward_std": 1.6741199493408203, + "rewards/accuracy_reward/mean": 1.3649004697799683, + "rewards/accuracy_reward/std": 3.05597186088562, + "rewards/tag_count_reward/mean": 0.7265625, + "rewards/tag_count_reward/std": 0.13152606785297394, + "step": 1480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 685.0, + "completions/max_terminated_length": 685.0, + "completions/mean_length": 503.265625, + "completions/mean_terminated_length": 503.265625, + "completions/min_length": 336.0, + "completions/min_terminated_length": 336.0, + "epoch": 0.8948640483383686, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.05983623489737511, + "learning_rate": 3.817228258265655e-07, + "loss": 0.011, + "num_tokens": 251208059.0, + "reward": 4.844834327697754, + "reward_std": 2.3103342056274414, + "rewards/accuracy_reward/mean": 4.094834327697754, + "rewards/accuracy_reward/std": 3.691887378692627, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1481 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 837.0, + "completions/max_terminated_length": 837.0, + "completions/mean_length": 515.234375, + "completions/mean_terminated_length": 515.234375, + "completions/min_length": 364.0, + "completions/min_terminated_length": 364.0, + "epoch": 0.8954682779456193, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0002660393947735429, + "learning_rate": 3.8080092315769015e-07, + "loss": -0.0, + "num_tokens": 251361178.0, + "reward": 4.471738815307617, + "reward_std": 0.012833312153816223, + "rewards/accuracy_reward/mean": 3.7217390537261963, + "rewards/accuracy_reward/std": 3.751230478286743, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1482 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 850.0, + "completions/max_terminated_length": 850.0, + "completions/mean_length": 616.59375, + "completions/mean_terminated_length": 616.59375, + "completions/min_length": 464.0, + "completions/min_terminated_length": 464.0, + "epoch": 0.89607250755287, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.038154177367687225, + "learning_rate": 3.798840895118521e-07, + "loss": -0.0004, + "num_tokens": 251592944.0, + "reward": 3.781461000442505, + "reward_std": 1.4718167781829834, + "rewards/accuracy_reward/mean": 3.031461000442505, + "rewards/accuracy_reward/std": 3.6938695907592773, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1483 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 905.0, + "completions/mean_length": 582.21875, + "completions/mean_terminated_length": 558.952392578125, + "completions/min_length": 398.0, + "completions/min_terminated_length": 398.0, + "epoch": 0.8966767371601209, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.053857989609241486, + "learning_rate": 3.789723285507711e-07, + "loss": -0.029, + "num_tokens": 251757150.0, + "reward": 4.895178318023682, + "reward_std": 2.8681511878967285, + "rewards/accuracy_reward/mean": 4.156897068023682, + "rewards/accuracy_reward/std": 3.9131851196289062, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 1484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 832.0, + "completions/max_terminated_length": 832.0, + "completions/mean_length": 479.953125, + "completions/mean_terminated_length": 479.953125, + "completions/min_length": 269.0, + "completions/min_terminated_length": 269.0, + "epoch": 0.8972809667673716, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03428468480706215, + "learning_rate": 3.780656439159063e-07, + "loss": 0.0025, + "num_tokens": 251896667.0, + "reward": 4.926693916320801, + "reward_std": 1.3570873737335205, + "rewards/accuracy_reward/mean": 4.176693916320801, + "rewards/accuracy_reward/std": 3.7126853466033936, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 953.0, + "completions/max_terminated_length": 953.0, + "completions/mean_length": 630.328125, + "completions/mean_terminated_length": 630.328125, + "completions/min_length": 359.0, + "completions/min_terminated_length": 359.0, + "epoch": 0.8978851963746224, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04964809864759445, + "learning_rate": 3.771640392284436e-07, + "loss": -0.0144, + "num_tokens": 252081616.0, + "reward": 5.725478172302246, + "reward_std": 2.184476375579834, + "rewards/accuracy_reward/mean": 4.975478172302246, + "rewards/accuracy_reward/std": 3.5403664112091064, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1486 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 768.0, + "completions/max_terminated_length": 768.0, + "completions/mean_length": 531.421875, + "completions/mean_terminated_length": 531.421875, + "completions/min_length": 365.0, + "completions/min_terminated_length": 365.0, + "epoch": 0.8984894259818731, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05182136222720146, + "learning_rate": 3.762675180892793e-07, + "loss": 0.0269, + "num_tokens": 252251371.0, + "reward": 5.2522125244140625, + "reward_std": 2.264435291290283, + "rewards/accuracy_reward/mean": 4.5022125244140625, + "rewards/accuracy_reward/std": 3.6005301475524902, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1487 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1411.0, + "completions/mean_length": 641.078125, + "completions/mean_terminated_length": 618.74609375, + "completions/min_length": 356.0, + "completions/min_terminated_length": 356.0, + "epoch": 0.8990936555891239, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07248283922672272, + "learning_rate": 3.753760840790081e-07, + "loss": 0.0294, + "num_tokens": 252428720.0, + "reward": 4.410835266113281, + "reward_std": 2.0556674003601074, + "rewards/accuracy_reward/mean": 3.6725540161132812, + "rewards/accuracy_reward/std": 3.7783193588256836, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 1488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1140.0, + "completions/max_terminated_length": 1140.0, + "completions/mean_length": 531.9375, + "completions/mean_terminated_length": 531.9375, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.8996978851963746, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.06782865524291992, + "learning_rate": 3.7448974075790573e-07, + "loss": 0.0574, + "num_tokens": 252618252.0, + "reward": 4.506891250610352, + "reward_std": 1.780167579650879, + "rewards/accuracy_reward/mean": 3.756891965866089, + "rewards/accuracy_reward/std": 3.737361431121826, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1489 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 721.0, + "completions/max_terminated_length": 721.0, + "completions/mean_length": 521.203125, + "completions/mean_terminated_length": 521.203125, + "completions/min_length": 299.0, + "completions/min_terminated_length": 299.0, + "epoch": 0.9003021148036254, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.023085838183760643, + "learning_rate": 3.736084916659171e-07, + "loss": -0.0094, + "num_tokens": 252746025.0, + "reward": 7.825087547302246, + "reward_std": 0.808528482913971, + "rewards/accuracy_reward/mean": 7.075087547302246, + "rewards/accuracy_reward/std": 1.6595041751861572, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1440.0, + "completions/max_terminated_length": 1440.0, + "completions/mean_length": 760.84375, + "completions/mean_terminated_length": 760.84375, + "completions/min_length": 451.0, + "completions/min_terminated_length": 451.0, + "epoch": 0.9009063444108761, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03574606776237488, + "learning_rate": 3.727323403226415e-07, + "loss": -0.0357, + "num_tokens": 252930287.0, + "reward": 2.7225234508514404, + "reward_std": 1.4392492771148682, + "rewards/accuracy_reward/mean": 1.9725234508514404, + "rewards/accuracy_reward/std": 3.1240694522857666, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1491 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 923.0, + "completions/max_terminated_length": 923.0, + "completions/mean_length": 578.953125, + "completions/mean_terminated_length": 578.953125, + "completions/min_length": 403.0, + "completions/min_terminated_length": 403.0, + "epoch": 0.9015105740181268, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05404764786362648, + "learning_rate": 3.7186129022731825e-07, + "loss": -0.0246, + "num_tokens": 253067804.0, + "reward": 5.988804817199707, + "reward_std": 2.3555431365966797, + "rewards/accuracy_reward/mean": 5.238804817199707, + "rewards/accuracy_reward/std": 3.4153623580932617, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 826.0, + "completions/max_terminated_length": 826.0, + "completions/mean_length": 519.484375, + "completions/mean_terminated_length": 519.484375, + "completions/min_length": 288.0, + "completions/min_terminated_length": 288.0, + "epoch": 0.9021148036253777, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04753565043210983, + "learning_rate": 3.709953448588129e-07, + "loss": -0.0413, + "num_tokens": 253265051.0, + "reward": 6.432906150817871, + "reward_std": 2.3781888484954834, + "rewards/accuracy_reward/mean": 5.682906150817871, + "rewards/accuracy_reward/std": 3.169374465942383, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1493 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 838.0, + "completions/mean_length": 598.90625, + "completions/mean_terminated_length": 575.90478515625, + "completions/min_length": 331.0, + "completions/min_terminated_length": 331.0, + "epoch": 0.9027190332326284, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03963141143321991, + "learning_rate": 3.701345076756031e-07, + "loss": -0.0501, + "num_tokens": 253490197.0, + "reward": 4.098782539367676, + "reward_std": 1.3600687980651855, + "rewards/accuracy_reward/mean": 3.360501766204834, + "rewards/accuracy_reward/std": 3.7547595500946045, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 1494 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 743.0, + "completions/max_terminated_length": 743.0, + "completions/mean_length": 457.734375, + "completions/mean_terminated_length": 457.734375, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "epoch": 0.9033232628398792, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.019677894189953804, + "learning_rate": 3.6927878211576586e-07, + "loss": 0.0135, + "num_tokens": 253659444.0, + "reward": 4.455293655395508, + "reward_std": 0.926327645778656, + "rewards/accuracy_reward/mean": 3.705293655395508, + "rewards/accuracy_reward/std": 3.7347278594970703, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1348.0, + "completions/max_terminated_length": 1348.0, + "completions/mean_length": 595.671875, + "completions/mean_terminated_length": 595.671875, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "epoch": 0.9039274924471299, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.024148648604750633, + "learning_rate": 3.6842817159696236e-07, + "loss": 0.014, + "num_tokens": 253796943.0, + "reward": 4.418800354003906, + "reward_std": 0.7154778838157654, + "rewards/accuracy_reward/mean": 3.6688003540039062, + "rewards/accuracy_reward/std": 3.561162233352661, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 822.0, + "completions/max_terminated_length": 822.0, + "completions/mean_length": 544.96875, + "completions/mean_terminated_length": 544.96875, + "completions/min_length": 356.0, + "completions/min_terminated_length": 356.0, + "epoch": 0.9045317220543807, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.02903926372528076, + "learning_rate": 3.6758267951642465e-07, + "loss": 0.0017, + "num_tokens": 253966573.0, + "reward": 4.827471733093262, + "reward_std": 0.7654163241386414, + "rewards/accuracy_reward/mean": 4.077471733093262, + "rewards/accuracy_reward/std": 3.740967035293579, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1497 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1578.0, + "completions/max_terminated_length": 1578.0, + "completions/mean_length": 681.171875, + "completions/mean_terminated_length": 681.171875, + "completions/min_length": 372.0, + "completions/min_terminated_length": 372.0, + "epoch": 0.9051359516616314, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05504172295331955, + "learning_rate": 3.667423092509432e-07, + "loss": -0.0272, + "num_tokens": 254196888.0, + "reward": 1.9646670818328857, + "reward_std": 2.4292962551116943, + "rewards/accuracy_reward/mean": 1.2146672010421753, + "rewards/accuracy_reward/std": 2.871938943862915, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1498 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 696.0, + "completions/max_terminated_length": 696.0, + "completions/mean_length": 433.953125, + "completions/mean_terminated_length": 433.953125, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "epoch": 0.9057401812688822, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.049027666449546814, + "learning_rate": 3.659070641568523e-07, + "loss": 0.0107, + "num_tokens": 254449557.0, + "reward": 6.984636306762695, + "reward_std": 1.7779322862625122, + "rewards/accuracy_reward/mean": 6.234635829925537, + "rewards/accuracy_reward/std": 2.7048885822296143, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1499 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1000.0, + "completions/max_terminated_length": 1000.0, + "completions/mean_length": 651.96875, + "completions/mean_terminated_length": 651.96875, + "completions/min_length": 458.0, + "completions/min_terminated_length": 458.0, + "epoch": 0.9063444108761329, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0357978492975235, + "learning_rate": 3.650769475700163e-07, + "loss": 0.009, + "num_tokens": 254609107.0, + "reward": 3.4706215858459473, + "reward_std": 1.4456305503845215, + "rewards/accuracy_reward/mean": 2.7206218242645264, + "rewards/accuracy_reward/std": 3.588421106338501, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 705.0, + "completions/max_terminated_length": 705.0, + "completions/mean_length": 423.28125, + "completions/mean_terminated_length": 423.28125, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.9069486404833836, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0004325883637648076, + "learning_rate": 3.642519628058177e-07, + "loss": -0.0006, + "num_tokens": 254733989.0, + "reward": 4.453298568725586, + "reward_std": 0.018916074186563492, + "rewards/accuracy_reward/mean": 3.703298568725586, + "rewards/accuracy_reward/std": 3.732668399810791, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1501 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 981.0, + "completions/max_terminated_length": 981.0, + "completions/mean_length": 543.578125, + "completions/mean_terminated_length": 543.578125, + "completions/min_length": 287.0, + "completions/min_terminated_length": 287.0, + "epoch": 0.9075528700906345, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04632469639182091, + "learning_rate": 3.634321131591433e-07, + "loss": 0.0302, + "num_tokens": 254978890.0, + "reward": 3.4796907901763916, + "reward_std": 2.0515432357788086, + "rewards/accuracy_reward/mean": 2.7296907901763916, + "rewards/accuracy_reward/std": 3.510287046432495, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1502 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 930.0, + "completions/max_terminated_length": 930.0, + "completions/mean_length": 597.859375, + "completions/mean_terminated_length": 597.859375, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "epoch": 0.9081570996978852, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03929077461361885, + "learning_rate": 3.626174019043702e-07, + "loss": 0.0154, + "num_tokens": 255114721.0, + "reward": 5.056653022766113, + "reward_std": 1.6227726936340332, + "rewards/accuracy_reward/mean": 4.306652069091797, + "rewards/accuracy_reward/std": 3.642812490463257, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1503 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1228.0, + "completions/max_terminated_length": 1228.0, + "completions/mean_length": 678.546875, + "completions/mean_terminated_length": 678.546875, + "completions/min_length": 419.0, + "completions/min_terminated_length": 419.0, + "epoch": 0.908761329305136, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0586201436817646, + "learning_rate": 3.618078322953533e-07, + "loss": -0.0036, + "num_tokens": 255316900.0, + "reward": 6.3534417152404785, + "reward_std": 2.5989646911621094, + "rewards/accuracy_reward/mean": 5.603442192077637, + "rewards/accuracy_reward/std": 3.188825845718384, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1504 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 801.0, + "completions/max_terminated_length": 801.0, + "completions/mean_length": 553.625, + "completions/mean_terminated_length": 553.625, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "epoch": 0.9093655589123867, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.039728522300720215, + "learning_rate": 3.610034075654135e-07, + "loss": 0.0186, + "num_tokens": 255475084.0, + "reward": 3.2256529331207275, + "reward_std": 1.3987857103347778, + "rewards/accuracy_reward/mean": 2.4756531715393066, + "rewards/accuracy_reward/std": 3.6231460571289062, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1166.0, + "completions/max_terminated_length": 1166.0, + "completions/mean_length": 599.21875, + "completions/mean_terminated_length": 599.21875, + "completions/min_length": 348.0, + "completions/min_terminated_length": 348.0, + "epoch": 0.9099697885196375, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04256278648972511, + "learning_rate": 3.602041309273224e-07, + "loss": 0.0341, + "num_tokens": 255627322.0, + "reward": 3.0776047706604004, + "reward_std": 2.110593318939209, + "rewards/accuracy_reward/mean": 2.3276047706604004, + "rewards/accuracy_reward/std": 3.4167442321777344, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1506 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1310.0, + "completions/max_terminated_length": 1310.0, + "completions/mean_length": 629.515625, + "completions/mean_terminated_length": 629.515625, + "completions/min_length": 334.0, + "completions/min_terminated_length": 334.0, + "epoch": 0.9105740181268882, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.030843326821923256, + "learning_rate": 3.5941000557329136e-07, + "loss": -0.0001, + "num_tokens": 255815227.0, + "reward": 5.885879993438721, + "reward_std": 1.3718475103378296, + "rewards/accuracy_reward/mean": 5.135879993438721, + "rewards/accuracy_reward/std": 3.474277973175049, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1507 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 913.0, + "completions/max_terminated_length": 913.0, + "completions/mean_length": 580.90625, + "completions/mean_terminated_length": 580.90625, + "completions/min_length": 369.0, + "completions/min_terminated_length": 369.0, + "epoch": 0.911178247734139, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07049828767776489, + "learning_rate": 3.586210346749586e-07, + "loss": -0.0636, + "num_tokens": 255989317.0, + "reward": 3.70469069480896, + "reward_std": 3.594238758087158, + "rewards/accuracy_reward/mean": 2.95469069480896, + "rewards/accuracy_reward/std": 3.7362418174743652, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1508 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1197.0, + "completions/max_terminated_length": 1197.0, + "completions/mean_length": 631.84375, + "completions/mean_terminated_length": 631.84375, + "completions/min_length": 286.0, + "completions/min_terminated_length": 286.0, + "epoch": 0.9117824773413897, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03283466398715973, + "learning_rate": 3.578372213833754e-07, + "loss": -0.0008, + "num_tokens": 256154011.0, + "reward": 4.8132829666137695, + "reward_std": 1.128592610359192, + "rewards/accuracy_reward/mean": 4.0632829666137695, + "rewards/accuracy_reward/std": 3.7616236209869385, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1509 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 723.0, + "completions/max_terminated_length": 723.0, + "completions/mean_length": 483.875, + "completions/mean_terminated_length": 483.875, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "epoch": 0.9123867069486404, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.037593815475702286, + "learning_rate": 3.570585688289942e-07, + "loss": 0.0055, + "num_tokens": 256309203.0, + "reward": 4.523220062255859, + "reward_std": 2.1750762462615967, + "rewards/accuracy_reward/mean": 3.7732203006744385, + "rewards/accuracy_reward/std": 3.819355010986328, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1510 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1170.0, + "completions/mean_length": 670.203125, + "completions/mean_terminated_length": 648.3333740234375, + "completions/min_length": 368.0, + "completions/min_terminated_length": 368.0, + "epoch": 0.9129909365558913, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.05215397849678993, + "learning_rate": 3.5628508012165655e-07, + "loss": -0.0252, + "num_tokens": 256481552.0, + "reward": 3.7581124305725098, + "reward_std": 2.250570774078369, + "rewards/accuracy_reward/mean": 3.0198311805725098, + "rewards/accuracy_reward/std": 3.675096035003662, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 1511 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 598.0, + "completions/mean_length": 450.109375, + "completions/mean_terminated_length": 424.7460632324219, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "epoch": 0.913595166163142, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04629223793745041, + "learning_rate": 3.5551675835057994e-07, + "loss": -0.0187, + "num_tokens": 256633015.0, + "reward": 2.811981201171875, + "reward_std": 1.962374210357666, + "rewards/accuracy_reward/mean": 2.073699951171875, + "rewards/accuracy_reward/std": 3.272162914276123, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 1512 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 867.0, + "completions/max_terminated_length": 867.0, + "completions/mean_length": 484.4375, + "completions/mean_terminated_length": 484.4375, + "completions/min_length": 306.0, + "completions/min_terminated_length": 306.0, + "epoch": 0.9141993957703928, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.00024055680842138827, + "learning_rate": 3.547536065843458e-07, + "loss": -0.0, + "num_tokens": 256778435.0, + "reward": 2.617926597595215, + "reward_std": 0.007721965666860342, + "rewards/accuracy_reward/mean": 1.8679265975952148, + "rewards/accuracy_reward/std": 3.2609548568725586, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1513 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1028.0, + "completions/max_terminated_length": 1028.0, + "completions/mean_length": 540.8125, + "completions/mean_terminated_length": 540.8125, + "completions/min_length": 304.0, + "completions/min_terminated_length": 304.0, + "epoch": 0.9148036253776435, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.034705083817243576, + "learning_rate": 3.539956278708873e-07, + "loss": -0.0018, + "num_tokens": 256922199.0, + "reward": 6.4754958152771, + "reward_std": 1.4952977895736694, + "rewards/accuracy_reward/mean": 5.7254958152771, + "rewards/accuracy_reward/std": 3.422837972640991, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1514 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 780.0, + "completions/max_terminated_length": 780.0, + "completions/mean_length": 607.875, + "completions/mean_terminated_length": 607.875, + "completions/min_length": 400.0, + "completions/min_terminated_length": 400.0, + "epoch": 0.9154078549848943, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04116547107696533, + "learning_rate": 3.5324282523747705e-07, + "loss": 0.0106, + "num_tokens": 257144447.0, + "reward": 6.316357612609863, + "reward_std": 1.8542405366897583, + "rewards/accuracy_reward/mean": 5.5702643394470215, + "rewards/accuracy_reward/std": 3.2416956424713135, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 1515 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1682.0, + "completions/max_terminated_length": 1682.0, + "completions/mean_length": 685.046875, + "completions/mean_terminated_length": 685.046875, + "completions/min_length": 386.0, + "completions/min_terminated_length": 386.0, + "epoch": 0.916012084592145, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.02509310469031334, + "learning_rate": 3.524952016907151e-07, + "loss": -0.0018, + "num_tokens": 257341282.0, + "reward": 2.202293872833252, + "reward_std": 0.9245275855064392, + "rewards/accuracy_reward/mean": 1.4522937536239624, + "rewards/accuracy_reward/std": 3.082573413848877, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1516 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 754.0, + "completions/max_terminated_length": 754.0, + "completions/mean_length": 540.375, + "completions/mean_terminated_length": 540.375, + "completions/min_length": 354.0, + "completions/min_terminated_length": 354.0, + "epoch": 0.9166163141993958, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0251467265188694, + "learning_rate": 3.5175276021651635e-07, + "loss": -0.0092, + "num_tokens": 257518682.0, + "reward": 7.861323356628418, + "reward_std": 1.1195390224456787, + "rewards/accuracy_reward/mean": 7.111323356628418, + "rewards/accuracy_reward/std": 1.589800238609314, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1517 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 779.0, + "completions/max_terminated_length": 779.0, + "completions/mean_length": 507.015625, + "completions/mean_terminated_length": 507.015625, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "epoch": 0.9172205438066465, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04051586613059044, + "learning_rate": 3.5101550378010016e-07, + "loss": 0.0195, + "num_tokens": 257654555.0, + "reward": 4.852538585662842, + "reward_std": 1.4210419654846191, + "rewards/accuracy_reward/mean": 4.1064453125, + "rewards/accuracy_reward/std": 3.718155860900879, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 1518 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 889.0, + "completions/max_terminated_length": 889.0, + "completions/mean_length": 572.984375, + "completions/mean_terminated_length": 572.984375, + "completions/min_length": 415.0, + "completions/min_terminated_length": 415.0, + "epoch": 0.9178247734138972, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03941365331411362, + "learning_rate": 3.5028343532597656e-07, + "loss": 0.0071, + "num_tokens": 257786986.0, + "reward": 6.633934497833252, + "reward_std": 1.356370449066162, + "rewards/accuracy_reward/mean": 5.883934020996094, + "rewards/accuracy_reward/std": 3.036314010620117, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1519 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 898.0, + "completions/max_terminated_length": 898.0, + "completions/mean_length": 559.46875, + "completions/mean_terminated_length": 559.46875, + "completions/min_length": 321.0, + "completions/min_terminated_length": 321.0, + "epoch": 0.918429003021148, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05870204046368599, + "learning_rate": 3.4955655777793557e-07, + "loss": -0.0202, + "num_tokens": 257921800.0, + "reward": 6.441534042358398, + "reward_std": 2.8034753799438477, + "rewards/accuracy_reward/mean": 5.691534042358398, + "rewards/accuracy_reward/std": 3.18495512008667, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 673.0, + "completions/max_terminated_length": 673.0, + "completions/mean_length": 485.03125, + "completions/mean_terminated_length": 485.03125, + "completions/min_length": 310.0, + "completions/min_terminated_length": 310.0, + "epoch": 0.9190332326283988, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03991251811385155, + "learning_rate": 3.4883487403903613e-07, + "loss": 0.0128, + "num_tokens": 258030010.0, + "reward": 6.280635833740234, + "reward_std": 2.045226573944092, + "rewards/accuracy_reward/mean": 5.530635833740234, + "rewards/accuracy_reward/std": 3.2188241481781006, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1521 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1027.0, + "completions/max_terminated_length": 1027.0, + "completions/mean_length": 637.015625, + "completions/mean_terminated_length": 637.015625, + "completions/min_length": 330.0, + "completions/min_terminated_length": 330.0, + "epoch": 0.9196374622356496, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03892865777015686, + "learning_rate": 3.481183869915931e-07, + "loss": 0.0236, + "num_tokens": 258197499.0, + "reward": 5.0621514320373535, + "reward_std": 2.035280227661133, + "rewards/accuracy_reward/mean": 4.3121514320373535, + "rewards/accuracy_reward/std": 3.7819364070892334, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1522 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 749.0, + "completions/max_terminated_length": 749.0, + "completions/mean_length": 521.46875, + "completions/mean_terminated_length": 521.46875, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, + "epoch": 0.9202416918429003, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04825363680720329, + "learning_rate": 3.474070994971661e-07, + "loss": -0.0029, + "num_tokens": 258319433.0, + "reward": 2.8210015296936035, + "reward_std": 2.242851734161377, + "rewards/accuracy_reward/mean": 2.0710017681121826, + "rewards/accuracy_reward/std": 3.2175939083099365, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1523 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1029.0, + "completions/max_terminated_length": 1029.0, + "completions/mean_length": 598.125, + "completions/mean_terminated_length": 598.125, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, + "epoch": 0.9208459214501511, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.028851494193077087, + "learning_rate": 3.4670101439654904e-07, + "loss": 0.0018, + "num_tokens": 258472705.0, + "reward": 3.8203563690185547, + "reward_std": 0.8826039433479309, + "rewards/accuracy_reward/mean": 3.0703563690185547, + "rewards/accuracy_reward/std": 3.6556601524353027, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1524 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1351.0, + "completions/max_terminated_length": 1351.0, + "completions/mean_length": 616.71875, + "completions/mean_terminated_length": 616.71875, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 0.9214501510574018, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03355778753757477, + "learning_rate": 3.4600013450975794e-07, + "loss": 0.0057, + "num_tokens": 258623727.0, + "reward": 1.8982219696044922, + "reward_std": 1.1144071817398071, + "rewards/accuracy_reward/mean": 1.1521281003952026, + "rewards/accuracy_reward/std": 2.9198505878448486, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 1525 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1161.0, + "completions/max_terminated_length": 1161.0, + "completions/mean_length": 595.125, + "completions/mean_terminated_length": 595.125, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "epoch": 0.9220543806646526, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.001969860168173909, + "learning_rate": 3.4530446263601977e-07, + "loss": -0.0012, + "num_tokens": 258781975.0, + "reward": 2.635939121246338, + "reward_std": 0.10022678226232529, + "rewards/accuracy_reward/mean": 1.8898452520370483, + "rewards/accuracy_reward/std": 3.207984685897827, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 1526 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 848.0, + "completions/max_terminated_length": 848.0, + "completions/mean_length": 531.875, + "completions/mean_terminated_length": 531.875, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "epoch": 0.9226586102719033, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.02379211224615574, + "learning_rate": 3.446140015537611e-07, + "loss": -0.0087, + "num_tokens": 258952975.0, + "reward": 4.164968967437744, + "reward_std": 0.8069199323654175, + "rewards/accuracy_reward/mean": 3.413015842437744, + "rewards/accuracy_reward/std": 3.7629408836364746, + "rewards/tag_count_reward/mean": 0.751953125, + "rewards/tag_count_reward/std": 0.015625, + "step": 1527 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 719.0, + "completions/max_terminated_length": 719.0, + "completions/mean_length": 487.90625, + "completions/mean_terminated_length": 487.90625, + "completions/min_length": 326.0, + "completions/min_terminated_length": 326.0, + "epoch": 0.923262839879154, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.047425005584955215, + "learning_rate": 3.4392875402059763e-07, + "loss": -0.016, + "num_tokens": 259127785.0, + "reward": 5.25485897064209, + "reward_std": 2.8012185096740723, + "rewards/accuracy_reward/mean": 4.504859447479248, + "rewards/accuracy_reward/std": 3.703890323638916, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1528 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 805.0, + "completions/max_terminated_length": 805.0, + "completions/mean_length": 551.046875, + "completions/mean_terminated_length": 551.046875, + "completions/min_length": 342.0, + "completions/min_terminated_length": 342.0, + "epoch": 0.9238670694864048, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05090116709470749, + "learning_rate": 3.432487227733229e-07, + "loss": 0.0193, + "num_tokens": 259266540.0, + "reward": 5.2651872634887695, + "reward_std": 2.3032758235931396, + "rewards/accuracy_reward/mean": 4.5151872634887695, + "rewards/accuracy_reward/std": 3.628765344619751, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1529 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1048.0, + "completions/max_terminated_length": 1048.0, + "completions/mean_length": 646.953125, + "completions/mean_terminated_length": 646.953125, + "completions/min_length": 402.0, + "completions/min_terminated_length": 402.0, + "epoch": 0.9244712990936556, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0619870200753212, + "learning_rate": 3.4257391052789695e-07, + "loss": 0.0237, + "num_tokens": 259449305.0, + "reward": 5.503604888916016, + "reward_std": 3.186581611633301, + "rewards/accuracy_reward/mean": 4.753604888916016, + "rewards/accuracy_reward/std": 3.6093902587890625, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1530 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 854.0, + "completions/max_terminated_length": 854.0, + "completions/mean_length": 467.609375, + "completions/mean_terminated_length": 467.609375, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "epoch": 0.9250755287009064, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03975936025381088, + "learning_rate": 3.419043199794355e-07, + "loss": 0.0172, + "num_tokens": 259655424.0, + "reward": 5.593832969665527, + "reward_std": 1.6204934120178223, + "rewards/accuracy_reward/mean": 4.843832969665527, + "rewards/accuracy_reward/std": 3.56887149810791, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1531 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1011.0, + "completions/max_terminated_length": 1011.0, + "completions/mean_length": 650.59375, + "completions/mean_terminated_length": 650.59375, + "completions/min_length": 339.0, + "completions/min_terminated_length": 339.0, + "epoch": 0.9256797583081571, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.027067109942436218, + "learning_rate": 3.412399538022001e-07, + "loss": -0.0055, + "num_tokens": 259807590.0, + "reward": 6.099698066711426, + "reward_std": 0.7756175994873047, + "rewards/accuracy_reward/mean": 5.349698543548584, + "rewards/accuracy_reward/std": 3.256389856338501, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1532 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1640.0, + "completions/mean_length": 655.03125, + "completions/mean_terminated_length": 632.920654296875, + "completions/min_length": 408.0, + "completions/min_terminated_length": 408.0, + "epoch": 0.9262839879154079, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0489625558257103, + "learning_rate": 3.405808146495866e-07, + "loss": -0.0437, + "num_tokens": 259953400.0, + "reward": 3.684417247772217, + "reward_std": 1.0934860706329346, + "rewards/accuracy_reward/mean": 2.946135997772217, + "rewards/accuracy_reward/std": 3.652371406555176, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 1533 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 684.0, + "completions/max_terminated_length": 684.0, + "completions/mean_length": 426.59375, + "completions/mean_terminated_length": 426.59375, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.9268882175226586, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.027166316285729408, + "learning_rate": 3.399269051541142e-07, + "loss": 0.0037, + "num_tokens": 260128878.0, + "reward": 2.3562936782836914, + "reward_std": 0.6957071423530579, + "rewards/accuracy_reward/mean": 1.6062936782836914, + "rewards/accuracy_reward/std": 3.1002705097198486, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1534 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1062.0, + "completions/mean_length": 515.265625, + "completions/mean_terminated_length": 490.9365234375, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "epoch": 0.9274924471299094, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.037267930805683136, + "learning_rate": 3.392782279274166e-07, + "loss": -0.0249, + "num_tokens": 260277663.0, + "reward": 1.969273567199707, + "reward_std": 1.9148542881011963, + "rewards/accuracy_reward/mean": 1.2427109479904175, + "rewards/accuracy_reward/std": 2.8382561206817627, + "rewards/tag_count_reward/mean": 0.7265625, + "rewards/tag_count_reward/std": 0.13152606785297394, + "step": 1535 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 842.0, + "completions/max_terminated_length": 842.0, + "completions/mean_length": 599.234375, + "completions/mean_terminated_length": 599.234375, + "completions/min_length": 325.0, + "completions/min_terminated_length": 325.0, + "epoch": 0.9280966767371601, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04597286880016327, + "learning_rate": 3.3863478556022955e-07, + "loss": 0.0059, + "num_tokens": 260439870.0, + "reward": 1.8618484735488892, + "reward_std": 2.4534964561462402, + "rewards/accuracy_reward/mean": 1.1118484735488892, + "rewards/accuracy_reward/std": 2.9314935207366943, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1536 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 858.0, + "completions/max_terminated_length": 858.0, + "completions/mean_length": 545.84375, + "completions/mean_terminated_length": 545.84375, + "completions/min_length": 349.0, + "completions/min_terminated_length": 349.0, + "epoch": 0.9287009063444108, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.040475260466337204, + "learning_rate": 3.379965806223815e-07, + "loss": -0.0347, + "num_tokens": 260625060.0, + "reward": 3.7530388832092285, + "reward_std": 1.631518840789795, + "rewards/accuracy_reward/mean": 3.0030391216278076, + "rewards/accuracy_reward/std": 3.5994670391082764, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1537 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1027.0, + "completions/max_terminated_length": 1027.0, + "completions/mean_length": 537.609375, + "completions/mean_terminated_length": 537.609375, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "epoch": 0.9293051359516616, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.021965214982628822, + "learning_rate": 3.3736361566278405e-07, + "loss": -0.0119, + "num_tokens": 260772875.0, + "reward": 2.923651695251465, + "reward_std": 0.6335277557373047, + "rewards/accuracy_reward/mean": 2.1736514568328857, + "rewards/accuracy_reward/std": 3.2635903358459473, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1538 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 806.0, + "completions/max_terminated_length": 806.0, + "completions/mean_length": 451.5, + "completions/mean_terminated_length": 451.5, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 0.9299093655589123, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.034181151539087296, + "learning_rate": 3.3673589320941996e-07, + "loss": -0.0166, + "num_tokens": 260917643.0, + "reward": 5.270630359649658, + "reward_std": 0.9364380836486816, + "rewards/accuracy_reward/mean": 4.5206298828125, + "rewards/accuracy_reward/std": 3.536712646484375, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1539 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 828.0, + "completions/max_terminated_length": 828.0, + "completions/mean_length": 538.5625, + "completions/mean_terminated_length": 538.5625, + "completions/min_length": 336.0, + "completions/min_terminated_length": 336.0, + "epoch": 0.9305135951661632, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.048908233642578125, + "learning_rate": 3.361134157693344e-07, + "loss": 0.0123, + "num_tokens": 261047135.0, + "reward": 5.166626453399658, + "reward_std": 1.8777892589569092, + "rewards/accuracy_reward/mean": 4.416626453399658, + "rewards/accuracy_reward/std": 3.5874173641204834, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 943.0, + "completions/mean_length": 625.125, + "completions/mean_terminated_length": 579.2257690429688, + "completions/min_length": 386.0, + "completions/min_terminated_length": 386.0, + "epoch": 0.9311178247734139, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.013051273301243782, + "learning_rate": 3.354961858286252e-07, + "loss": -0.0681, + "num_tokens": 261197239.0, + "reward": 2.1315126419067383, + "reward_std": 0.9177470803260803, + "rewards/accuracy_reward/mean": 1.4088562726974487, + "rewards/accuracy_reward/std": 3.1927733421325684, + "rewards/tag_count_reward/mean": 0.72265625, + "rewards/tag_count_reward/std": 0.13449780642986298, + "step": 1541 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 989.0, + "completions/max_terminated_length": 989.0, + "completions/mean_length": 603.28125, + "completions/mean_terminated_length": 603.28125, + "completions/min_length": 402.0, + "completions/min_terminated_length": 402.0, + "epoch": 0.9317220543806647, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03742845728993416, + "learning_rate": 3.348842058524318e-07, + "loss": 0.0111, + "num_tokens": 261372729.0, + "reward": 5.651739120483398, + "reward_std": 1.5225462913513184, + "rewards/accuracy_reward/mean": 4.901739120483398, + "rewards/accuracy_reward/std": 3.540447473526001, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1542 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1053.0, + "completions/mean_length": 665.3125, + "completions/mean_terminated_length": 573.1333618164062, + "completions/min_length": 347.0, + "completions/min_terminated_length": 347.0, + "epoch": 0.9323262839879154, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03427108749747276, + "learning_rate": 3.3427747828492575e-07, + "loss": -0.0585, + "num_tokens": 261522781.0, + "reward": 5.683241844177246, + "reward_std": 1.1377627849578857, + "rewards/accuracy_reward/mean": 4.980117321014404, + "rewards/accuracy_reward/std": 3.7095134258270264, + "rewards/tag_count_reward/mean": 0.703125, + "rewards/tag_count_reward/std": 0.18298126757144928, + "step": 1543 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 879.0, + "completions/max_terminated_length": 879.0, + "completions/mean_length": 582.53125, + "completions/mean_terminated_length": 582.53125, + "completions/min_length": 323.0, + "completions/min_terminated_length": 323.0, + "epoch": 0.9329305135951662, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04822336509823799, + "learning_rate": 3.336760055493013e-07, + "loss": -0.0105, + "num_tokens": 261687087.0, + "reward": 6.267057418823242, + "reward_std": 2.6616969108581543, + "rewards/accuracy_reward/mean": 5.5170578956604, + "rewards/accuracy_reward/std": 3.2459802627563477, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1544 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1320.0, + "completions/max_terminated_length": 1320.0, + "completions/mean_length": 631.09375, + "completions/mean_terminated_length": 631.09375, + "completions/min_length": 371.0, + "completions/min_terminated_length": 371.0, + "epoch": 0.9335347432024169, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.030460676178336143, + "learning_rate": 3.330797900477661e-07, + "loss": -0.0233, + "num_tokens": 261848101.0, + "reward": 3.0440969467163086, + "reward_std": 1.4839940071105957, + "rewards/accuracy_reward/mean": 2.2940969467163086, + "rewards/accuracy_reward/std": 3.5019173622131348, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1545 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 691.0, + "completions/max_terminated_length": 691.0, + "completions/mean_length": 523.90625, + "completions/mean_terminated_length": 523.90625, + "completions/min_length": 383.0, + "completions/min_terminated_length": 383.0, + "epoch": 0.9341389728096676, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05347742512822151, + "learning_rate": 3.324888341615304e-07, + "loss": 0.0239, + "num_tokens": 262027775.0, + "reward": 6.495532989501953, + "reward_std": 2.4490549564361572, + "rewards/accuracy_reward/mean": 5.745532989501953, + "rewards/accuracy_reward/std": 3.105539560317993, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1546 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 813.0, + "completions/max_terminated_length": 813.0, + "completions/mean_length": 567.375, + "completions/mean_terminated_length": 567.375, + "completions/min_length": 335.0, + "completions/min_terminated_length": 335.0, + "epoch": 0.9347432024169184, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06642554700374603, + "learning_rate": 3.319031402507981e-07, + "loss": 0.0057, + "num_tokens": 262235783.0, + "reward": 6.2094621658325195, + "reward_std": 1.4352972507476807, + "rewards/accuracy_reward/mean": 5.4594621658325195, + "rewards/accuracy_reward/std": 3.3320059776306152, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1547 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 734.0, + "completions/max_terminated_length": 734.0, + "completions/mean_length": 447.59375, + "completions/mean_terminated_length": 447.59375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "epoch": 0.9353474320241691, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03293062746524811, + "learning_rate": 3.313227106547582e-07, + "loss": -0.0113, + "num_tokens": 262380237.0, + "reward": 5.13388729095459, + "reward_std": 1.4228206872940063, + "rewards/accuracy_reward/mean": 4.38388729095459, + "rewards/accuracy_reward/std": 3.710380792617798, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1548 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 793.0, + "completions/max_terminated_length": 793.0, + "completions/mean_length": 487.203125, + "completions/mean_terminated_length": 487.203125, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 0.93595166163142, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.012343481183052063, + "learning_rate": 3.30747547691574e-07, + "loss": -0.0005, + "num_tokens": 262526778.0, + "reward": 6.208279609680176, + "reward_std": 0.46914663910865784, + "rewards/accuracy_reward/mean": 5.458279609680176, + "rewards/accuracy_reward/std": 3.3089654445648193, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1549 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1030.0, + "completions/mean_length": 636.140625, + "completions/mean_terminated_length": 590.5967407226562, + "completions/min_length": 306.0, + "completions/min_terminated_length": 306.0, + "epoch": 0.9365558912386707, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04918146878480911, + "learning_rate": 3.301776536583747e-07, + "loss": -0.0605, + "num_tokens": 262697779.0, + "reward": 2.1951375007629395, + "reward_std": 1.9746102094650269, + "rewards/accuracy_reward/mean": 1.46857488155365, + "rewards/accuracy_reward/std": 3.0383493900299072, + "rewards/tag_count_reward/mean": 0.7265625, + "rewards/tag_count_reward/std": 0.13152606785297394, + "step": 1550 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 679.0, + "completions/max_terminated_length": 679.0, + "completions/mean_length": 499.96875, + "completions/mean_terminated_length": 499.96875, + "completions/min_length": 317.0, + "completions/min_terminated_length": 317.0, + "epoch": 0.9371601208459215, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04204342141747475, + "learning_rate": 3.296130308312462e-07, + "loss": 0.0025, + "num_tokens": 262853009.0, + "reward": 3.893489122390747, + "reward_std": 1.3962645530700684, + "rewards/accuracy_reward/mean": 3.143489360809326, + "rewards/accuracy_reward/std": 3.7090883255004883, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1551 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 786.0, + "completions/max_terminated_length": 786.0, + "completions/mean_length": 519.640625, + "completions/mean_terminated_length": 519.640625, + "completions/min_length": 294.0, + "completions/min_terminated_length": 294.0, + "epoch": 0.9377643504531722, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.000298726256005466, + "learning_rate": 3.290536814652216e-07, + "loss": -0.0002, + "num_tokens": 263022234.0, + "reward": 2.6863720417022705, + "reward_std": 0.013580668717622757, + "rewards/accuracy_reward/mean": 1.936371922492981, + "rewards/accuracy_reward/std": 3.192678451538086, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1552 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 930.0, + "completions/max_terminated_length": 930.0, + "completions/mean_length": 501.78125, + "completions/mean_terminated_length": 501.78125, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, + "epoch": 0.938368580060423, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05000276863574982, + "learning_rate": 3.284996077942728e-07, + "loss": -0.009, + "num_tokens": 263228172.0, + "reward": 3.6538214683532715, + "reward_std": 1.5850917100906372, + "rewards/accuracy_reward/mean": 2.9077277183532715, + "rewards/accuracy_reward/std": 3.779747724533081, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 1553 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1223.0, + "completions/mean_length": 538.875, + "completions/mean_terminated_length": 514.920654296875, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, + "epoch": 0.9389728096676737, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.05056269094347954, + "learning_rate": 3.279508120313007e-07, + "loss": -0.0027, + "num_tokens": 263396660.0, + "reward": 4.422524929046631, + "reward_std": 2.716219425201416, + "rewards/accuracy_reward/mean": 3.695962429046631, + "rewards/accuracy_reward/std": 3.7921669483184814, + "rewards/tag_count_reward/mean": 0.7265625, + "rewards/tag_count_reward/std": 0.13152606785297394, + "step": 1554 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 873.0, + "completions/max_terminated_length": 873.0, + "completions/mean_length": 563.859375, + "completions/mean_terminated_length": 563.859375, + "completions/min_length": 305.0, + "completions/min_terminated_length": 305.0, + "epoch": 0.9395770392749244, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.026477178558707237, + "learning_rate": 3.2740729636812754e-07, + "loss": 0.0017, + "num_tokens": 263623723.0, + "reward": 3.807812452316284, + "reward_std": 0.9458088874816895, + "rewards/accuracy_reward/mean": 3.057812452316284, + "rewards/accuracy_reward/std": 3.672926664352417, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1555 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 753.0, + "completions/max_terminated_length": 753.0, + "completions/mean_length": 482.765625, + "completions/mean_terminated_length": 482.765625, + "completions/min_length": 312.0, + "completions/min_terminated_length": 312.0, + "epoch": 0.9401812688821752, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0004101917438674718, + "learning_rate": 3.268690629754872e-07, + "loss": -0.0, + "num_tokens": 263766204.0, + "reward": 6.342182636260986, + "reward_std": 0.02273348905146122, + "rewards/accuracy_reward/mean": 5.592182636260986, + "rewards/accuracy_reward/std": 3.2543134689331055, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1556 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 825.0, + "completions/max_terminated_length": 825.0, + "completions/mean_length": 537.03125, + "completions/mean_terminated_length": 537.03125, + "completions/min_length": 324.0, + "completions/min_terminated_length": 324.0, + "epoch": 0.9407854984894259, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.056678127497434616, + "learning_rate": 3.263361140030167e-07, + "loss": 0.0296, + "num_tokens": 263936526.0, + "reward": 6.119973182678223, + "reward_std": 2.711480140686035, + "rewards/accuracy_reward/mean": 5.369973182678223, + "rewards/accuracy_reward/std": 3.434091806411743, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1557 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 791.0, + "completions/max_terminated_length": 791.0, + "completions/mean_length": 540.078125, + "completions/mean_terminated_length": 540.078125, + "completions/min_length": 364.0, + "completions/min_terminated_length": 364.0, + "epoch": 0.9413897280966768, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.030973758548498154, + "learning_rate": 3.2580845157924784e-07, + "loss": 0.0022, + "num_tokens": 264109539.0, + "reward": 4.052429676055908, + "reward_std": 0.8092271089553833, + "rewards/accuracy_reward/mean": 3.302429676055908, + "rewards/accuracy_reward/std": 3.69048810005188, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1558 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1030.0, + "completions/mean_length": 604.015625, + "completions/mean_terminated_length": 557.4354858398438, + "completions/min_length": 336.0, + "completions/min_terminated_length": 336.0, + "epoch": 0.9419939577039275, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.046261612325906754, + "learning_rate": 3.252860778115987e-07, + "loss": -0.0417, + "num_tokens": 264239076.0, + "reward": 7.083416938781738, + "reward_std": 2.231435775756836, + "rewards/accuracy_reward/mean": 6.356854438781738, + "rewards/accuracy_reward/std": 2.7336649894714355, + "rewards/tag_count_reward/mean": 0.7265625, + "rewards/tag_count_reward/std": 0.13152606785297394, + "step": 1559 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 826.0, + "completions/max_terminated_length": 826.0, + "completions/mean_length": 506.1875, + "completions/mean_terminated_length": 506.1875, + "completions/min_length": 279.0, + "completions/min_terminated_length": 279.0, + "epoch": 0.9425981873111783, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.030582044273614883, + "learning_rate": 3.247689947863649e-07, + "loss": -0.0086, + "num_tokens": 264428208.0, + "reward": 5.734304428100586, + "reward_std": 1.4356428384780884, + "rewards/accuracy_reward/mean": 4.984304428100586, + "rewards/accuracy_reward/std": 3.5465409755706787, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 762.0, + "completions/max_terminated_length": 762.0, + "completions/mean_length": 510.375, + "completions/mean_terminated_length": 510.375, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "epoch": 0.943202416918429, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04753235727548599, + "learning_rate": 3.242572045687117e-07, + "loss": -0.0081, + "num_tokens": 264585560.0, + "reward": 5.016015529632568, + "reward_std": 2.9484524726867676, + "rewards/accuracy_reward/mean": 4.269921779632568, + "rewards/accuracy_reward/std": 3.710547924041748, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 1561 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 925.0, + "completions/max_terminated_length": 925.0, + "completions/mean_length": 599.859375, + "completions/mean_terminated_length": 599.859375, + "completions/min_length": 371.0, + "completions/min_terminated_length": 371.0, + "epoch": 0.9438066465256798, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04429687187075615, + "learning_rate": 3.2375070920266576e-07, + "loss": -0.0064, + "num_tokens": 264766335.0, + "reward": 3.802985906600952, + "reward_std": 1.995648741722107, + "rewards/accuracy_reward/mean": 3.052985906600952, + "rewards/accuracy_reward/std": 3.6189608573913574, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1562 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 793.0, + "completions/max_terminated_length": 793.0, + "completions/mean_length": 503.4375, + "completions/mean_terminated_length": 503.4375, + "completions/min_length": 269.0, + "completions/min_terminated_length": 269.0, + "epoch": 0.9444108761329305, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.046332068741321564, + "learning_rate": 3.2324951071110614e-07, + "loss": 0.0172, + "num_tokens": 264913611.0, + "reward": 4.800355434417725, + "reward_std": 1.448991060256958, + "rewards/accuracy_reward/mean": 4.050355911254883, + "rewards/accuracy_reward/std": 3.866156578063965, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1563 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 928.0, + "completions/max_terminated_length": 928.0, + "completions/mean_length": 455.59375, + "completions/mean_terminated_length": 455.59375, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "epoch": 0.9450151057401812, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03836679831147194, + "learning_rate": 3.227536110957572e-07, + "loss": -0.0187, + "num_tokens": 265089473.0, + "reward": 4.593099594116211, + "reward_std": 1.8125536441802979, + "rewards/accuracy_reward/mean": 3.84309983253479, + "rewards/accuracy_reward/std": 3.714583158493042, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1564 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 766.0, + "completions/max_terminated_length": 766.0, + "completions/mean_length": 559.34375, + "completions/mean_terminated_length": 559.34375, + "completions/min_length": 282.0, + "completions/min_terminated_length": 282.0, + "epoch": 0.945619335347432, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0323515385389328, + "learning_rate": 3.2226301233718047e-07, + "loss": -0.0221, + "num_tokens": 265247767.0, + "reward": 6.523523330688477, + "reward_std": 1.651716947555542, + "rewards/accuracy_reward/mean": 5.773523330688477, + "rewards/accuracy_reward/std": 3.052743673324585, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1565 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1029.0, + "completions/mean_length": 494.484375, + "completions/mean_terminated_length": 469.8254089355469, + "completions/min_length": 291.0, + "completions/min_terminated_length": 291.0, + "epoch": 0.9462235649546827, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05040956288576126, + "learning_rate": 3.217777163947661e-07, + "loss": 0.0023, + "num_tokens": 265401030.0, + "reward": 7.183645248413086, + "reward_std": 1.6514136791229248, + "rewards/accuracy_reward/mean": 6.445363998413086, + "rewards/accuracy_reward/std": 2.5065388679504395, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 1566 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1319.0, + "completions/max_terminated_length": 1319.0, + "completions/mean_length": 597.828125, + "completions/mean_terminated_length": 597.828125, + "completions/min_length": 384.0, + "completions/min_terminated_length": 384.0, + "epoch": 0.9468277945619336, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.04174056649208069, + "learning_rate": 3.2129772520672565e-07, + "loss": 0.0028, + "num_tokens": 265595371.0, + "reward": 1.5168390274047852, + "reward_std": 1.3337008953094482, + "rewards/accuracy_reward/mean": 0.7668390870094299, + "rewards/accuracy_reward/std": 2.1814756393432617, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1567 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1117.0, + "completions/max_terminated_length": 1117.0, + "completions/mean_length": 557.53125, + "completions/mean_terminated_length": 557.53125, + "completions/min_length": 283.0, + "completions/min_terminated_length": 283.0, + "epoch": 0.9474320241691843, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.02034536935389042, + "learning_rate": 3.208230406900842e-07, + "loss": -0.0016, + "num_tokens": 265773293.0, + "reward": 4.29223108291626, + "reward_std": 0.697108268737793, + "rewards/accuracy_reward/mean": 3.5422310829162598, + "rewards/accuracy_reward/std": 3.673987627029419, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1568 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 745.0, + "completions/max_terminated_length": 745.0, + "completions/mean_length": 482.84375, + "completions/mean_terminated_length": 482.84375, + "completions/min_length": 292.0, + "completions/min_terminated_length": 292.0, + "epoch": 0.9480362537764351, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0004767461505252868, + "learning_rate": 3.203536647406728e-07, + "loss": -0.0002, + "num_tokens": 265910691.0, + "reward": 8.167023658752441, + "reward_std": 0.030368156731128693, + "rewards/accuracy_reward/mean": 7.417023658752441, + "rewards/accuracy_reward/std": 0.04515283927321434, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1569 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1081.0, + "completions/max_terminated_length": 1081.0, + "completions/mean_length": 558.859375, + "completions/mean_terminated_length": 558.859375, + "completions/min_length": 378.0, + "completions/min_terminated_length": 378.0, + "epoch": 0.9486404833836858, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05304365232586861, + "learning_rate": 3.1988959923312026e-07, + "loss": -0.0135, + "num_tokens": 266039594.0, + "reward": 3.636064052581787, + "reward_std": 2.994330406188965, + "rewards/accuracy_reward/mean": 2.886064052581787, + "rewards/accuracy_reward/std": 3.693687915802002, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1570 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 685.0, + "completions/max_terminated_length": 685.0, + "completions/mean_length": 428.6875, + "completions/mean_terminated_length": 428.6875, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.9492447129909366, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.030391313135623932, + "learning_rate": 3.194308460208463e-07, + "loss": 0.0236, + "num_tokens": 266171526.0, + "reward": 4.140684604644775, + "reward_std": 1.2394317388534546, + "rewards/accuracy_reward/mean": 3.3945908546447754, + "rewards/accuracy_reward/std": 3.7098939418792725, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 1571 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 978.0, + "completions/max_terminated_length": 978.0, + "completions/mean_length": 601.5, + "completions/mean_terminated_length": 601.5, + "completions/min_length": 328.0, + "completions/min_terminated_length": 328.0, + "epoch": 0.9498489425981873, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04239743575453758, + "learning_rate": 3.1897740693605444e-07, + "loss": 0.0347, + "num_tokens": 266306630.0, + "reward": 5.07401180267334, + "reward_std": 1.8202790021896362, + "rewards/accuracy_reward/mean": 4.324012279510498, + "rewards/accuracy_reward/std": 3.696305990219116, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1572 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1166.0, + "completions/max_terminated_length": 1166.0, + "completions/mean_length": 599.21875, + "completions/mean_terminated_length": 599.21875, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "epoch": 0.950453172205438, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.034064218401908875, + "learning_rate": 3.185292837897239e-07, + "loss": 0.0351, + "num_tokens": 266449396.0, + "reward": 1.660632848739624, + "reward_std": 1.0191481113433838, + "rewards/accuracy_reward/mean": 0.910632848739624, + "rewards/accuracy_reward/std": 2.4791388511657715, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1573 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 811.0, + "completions/max_terminated_length": 811.0, + "completions/mean_length": 527.78125, + "completions/mean_terminated_length": 527.78125, + "completions/min_length": 290.0, + "completions/min_terminated_length": 290.0, + "epoch": 0.9510574018126888, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.040482260286808014, + "learning_rate": 3.180864783716023e-07, + "loss": 0.0107, + "num_tokens": 266604006.0, + "reward": 3.5555062294006348, + "reward_std": 2.297630786895752, + "rewards/accuracy_reward/mean": 2.8055062294006348, + "rewards/accuracy_reward/std": 3.594943046569824, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1574 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1279.0, + "completions/mean_length": 564.0625, + "completions/mean_terminated_length": 540.5079956054688, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, + "epoch": 0.9516616314199395, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0028214247431606054, + "learning_rate": 3.1764899245019985e-07, + "loss": -0.0135, + "num_tokens": 266739258.0, + "reward": 4.479732990264893, + "reward_std": 0.15146946907043457, + "rewards/accuracy_reward/mean": 3.7414517402648926, + "rewards/accuracy_reward/std": 3.716810941696167, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 1575 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1201.0, + "completions/max_terminated_length": 1201.0, + "completions/mean_length": 628.0, + "completions/mean_terminated_length": 628.0, + "completions/min_length": 353.0, + "completions/min_terminated_length": 353.0, + "epoch": 0.9522658610271904, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.039812132716178894, + "learning_rate": 3.172168277727805e-07, + "loss": -0.0118, + "num_tokens": 266917146.0, + "reward": 2.9230501651763916, + "reward_std": 1.8849380016326904, + "rewards/accuracy_reward/mean": 2.1730499267578125, + "rewards/accuracy_reward/std": 3.315951347351074, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1576 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 853.0, + "completions/max_terminated_length": 853.0, + "completions/mean_length": 580.8125, + "completions/mean_terminated_length": 580.8125, + "completions/min_length": 351.0, + "completions/min_terminated_length": 351.0, + "epoch": 0.9528700906344411, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.02997867576777935, + "learning_rate": 3.167899860653562e-07, + "loss": 0.005, + "num_tokens": 267062654.0, + "reward": 3.4588546752929688, + "reward_std": 0.9800798296928406, + "rewards/accuracy_reward/mean": 2.7088546752929688, + "rewards/accuracy_reward/std": 3.583362579345703, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1577 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1094.0, + "completions/max_terminated_length": 1094.0, + "completions/mean_length": 518.703125, + "completions/mean_terminated_length": 518.703125, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 0.9534743202416919, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0298696830868721, + "learning_rate": 3.1636846903267967e-07, + "loss": -0.009, + "num_tokens": 267287195.0, + "reward": 5.9608917236328125, + "reward_std": 1.3174362182617188, + "rewards/accuracy_reward/mean": 5.210892200469971, + "rewards/accuracy_reward/std": 3.4131553173065186, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1578 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 865.0, + "completions/max_terminated_length": 865.0, + "completions/mean_length": 498.34375, + "completions/mean_terminated_length": 498.34375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "epoch": 0.9540785498489426, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.030357209965586662, + "learning_rate": 3.1595227835823726e-07, + "loss": 0.0054, + "num_tokens": 267445889.0, + "reward": 3.477776527404785, + "reward_std": 1.291843295097351, + "rewards/accuracy_reward/mean": 2.727776527404785, + "rewards/accuracy_reward/std": 3.452298641204834, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1579 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 743.0, + "completions/max_terminated_length": 743.0, + "completions/mean_length": 546.953125, + "completions/mean_terminated_length": 546.953125, + "completions/min_length": 360.0, + "completions/min_terminated_length": 360.0, + "epoch": 0.9546827794561934, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.033656008541584015, + "learning_rate": 3.1554141570424297e-07, + "loss": -0.0011, + "num_tokens": 267587310.0, + "reward": 3.350384473800659, + "reward_std": 1.4835050106048584, + "rewards/accuracy_reward/mean": 2.60038423538208, + "rewards/accuracy_reward/std": 3.5692436695098877, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1580 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 706.0, + "completions/max_terminated_length": 706.0, + "completions/mean_length": 439.96875, + "completions/mean_terminated_length": 439.96875, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 0.9552870090634441, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.01700153760612011, + "learning_rate": 3.151358827116307e-07, + "loss": 0.0083, + "num_tokens": 267734684.0, + "reward": 4.312502861022949, + "reward_std": 0.528846263885498, + "rewards/accuracy_reward/mean": 3.5625030994415283, + "rewards/accuracy_reward/std": 3.738476514816284, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1581 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1067.0, + "completions/max_terminated_length": 1067.0, + "completions/mean_length": 501.984375, + "completions/mean_terminated_length": 501.984375, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 0.9558912386706948, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03920350968837738, + "learning_rate": 3.1473568100004905e-07, + "loss": 0.0015, + "num_tokens": 267877787.0, + "reward": 4.5562920570373535, + "reward_std": 1.6299071311950684, + "rewards/accuracy_reward/mean": 3.8062920570373535, + "rewards/accuracy_reward/std": 3.6385576725006104, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1582 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 836.0, + "completions/max_terminated_length": 836.0, + "completions/mean_length": 567.203125, + "completions/mean_terminated_length": 567.203125, + "completions/min_length": 385.0, + "completions/min_terminated_length": 385.0, + "epoch": 0.9564954682779456, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.02893051691353321, + "learning_rate": 3.143408121678536e-07, + "loss": 0.0035, + "num_tokens": 268088808.0, + "reward": 6.317970275878906, + "reward_std": 1.3729395866394043, + "rewards/accuracy_reward/mean": 5.567970275878906, + "rewards/accuracy_reward/std": 3.3026962280273438, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1583 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1024.0, + "completions/max_terminated_length": 1024.0, + "completions/mean_length": 660.140625, + "completions/mean_terminated_length": 660.140625, + "completions/min_length": 362.0, + "completions/min_terminated_length": 362.0, + "epoch": 0.9570996978851963, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.019024930894374847, + "learning_rate": 3.1395127779210154e-07, + "loss": 0.0016, + "num_tokens": 268254753.0, + "reward": 4.6248626708984375, + "reward_std": 0.47344785928726196, + "rewards/accuracy_reward/mean": 3.8748624324798584, + "rewards/accuracy_reward/std": 3.716935157775879, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1584 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 838.0, + "completions/max_terminated_length": 838.0, + "completions/mean_length": 566.328125, + "completions/mean_terminated_length": 566.328125, + "completions/min_length": 340.0, + "completions/min_terminated_length": 340.0, + "epoch": 0.9577039274924471, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.019898107275366783, + "learning_rate": 3.135670794285442e-07, + "loss": -0.0001, + "num_tokens": 268400214.0, + "reward": 8.026036262512207, + "reward_std": 0.5609511733055115, + "rewards/accuracy_reward/mean": 7.279942512512207, + "rewards/accuracy_reward/std": 1.0537676811218262, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 1585 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 825.0, + "completions/max_terminated_length": 825.0, + "completions/mean_length": 587.4375, + "completions/mean_terminated_length": 587.4375, + "completions/min_length": 397.0, + "completions/min_terminated_length": 397.0, + "epoch": 0.9583081570996979, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.031721677631139755, + "learning_rate": 3.131882186116225e-07, + "loss": 0.0054, + "num_tokens": 268539154.0, + "reward": 2.2202250957489014, + "reward_std": 0.7726206183433533, + "rewards/accuracy_reward/mean": 1.4702249765396118, + "rewards/accuracy_reward/std": 2.976858139038086, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1586 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1047.0, + "completions/max_terminated_length": 1047.0, + "completions/mean_length": 641.1875, + "completions/mean_terminated_length": 641.1875, + "completions/min_length": 389.0, + "completions/min_terminated_length": 389.0, + "epoch": 0.9589123867069487, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04734215512871742, + "learning_rate": 3.128146968544591e-07, + "loss": -0.003, + "num_tokens": 268701342.0, + "reward": 4.213582992553711, + "reward_std": 2.3552331924438477, + "rewards/accuracy_reward/mean": 3.463582754135132, + "rewards/accuracy_reward/std": 3.750333309173584, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1587 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1496.0, + "completions/mean_length": 601.921875, + "completions/mean_terminated_length": 578.96826171875, + "completions/min_length": 285.0, + "completions/min_terminated_length": 285.0, + "epoch": 0.9595166163141994, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04395764693617821, + "learning_rate": 3.1244651564885326e-07, + "loss": -0.0501, + "num_tokens": 268825897.0, + "reward": 3.221529722213745, + "reward_std": 1.409614086151123, + "rewards/accuracy_reward/mean": 2.483248472213745, + "rewards/accuracy_reward/std": 3.633685827255249, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 1588 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 910.0, + "completions/max_terminated_length": 910.0, + "completions/mean_length": 458.328125, + "completions/mean_terminated_length": 458.328125, + "completions/min_length": 291.0, + "completions/min_terminated_length": 291.0, + "epoch": 0.9601208459214502, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.016944831237196922, + "learning_rate": 3.1208367646527516e-07, + "loss": -0.0139, + "num_tokens": 268935982.0, + "reward": 4.5289692878723145, + "reward_std": 0.5810385346412659, + "rewards/accuracy_reward/mean": 3.7789692878723145, + "rewards/accuracy_reward/std": 3.759232759475708, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1589 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 737.0, + "completions/max_terminated_length": 737.0, + "completions/mean_length": 512.21875, + "completions/mean_terminated_length": 512.21875, + "completions/min_length": 342.0, + "completions/min_terminated_length": 342.0, + "epoch": 0.9607250755287009, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04819463938474655, + "learning_rate": 3.1172618075285904e-07, + "loss": -0.0119, + "num_tokens": 269131548.0, + "reward": 6.136891841888428, + "reward_std": 2.360116481781006, + "rewards/accuracy_reward/mean": 5.386892318725586, + "rewards/accuracy_reward/std": 3.3341994285583496, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1590 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1438.0, + "completions/max_terminated_length": 1438.0, + "completions/mean_length": 712.046875, + "completions/mean_terminated_length": 712.046875, + "completions/min_length": 329.0, + "completions/min_terminated_length": 329.0, + "epoch": 0.9613293051359516, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.02325456775724888, + "learning_rate": 3.1137402993939836e-07, + "loss": -0.004, + "num_tokens": 269283183.0, + "reward": 2.401172161102295, + "reward_std": 0.6248627305030823, + "rewards/accuracy_reward/mean": 1.6628906726837158, + "rewards/accuracy_reward/std": 3.2381136417388916, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 1591 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 912.0, + "completions/max_terminated_length": 912.0, + "completions/mean_length": 513.78125, + "completions/mean_terminated_length": 513.78125, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 0.9619335347432024, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03418084233999252, + "learning_rate": 3.110272254313397e-07, + "loss": 0.0245, + "num_tokens": 269402641.0, + "reward": 5.364365577697754, + "reward_std": 1.404150366783142, + "rewards/accuracy_reward/mean": 4.614365577697754, + "rewards/accuracy_reward/std": 3.6031739711761475, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1592 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1123.0, + "completions/max_terminated_length": 1123.0, + "completions/mean_length": 475.046875, + "completions/mean_terminated_length": 475.046875, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "epoch": 0.9625377643504531, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.036476023495197296, + "learning_rate": 3.106857686137769e-07, + "loss": -0.0098, + "num_tokens": 269639492.0, + "reward": 4.1374125480651855, + "reward_std": 1.8764549493789673, + "rewards/accuracy_reward/mean": 3.3874125480651855, + "rewards/accuracy_reward/std": 3.694791078567505, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1593 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 1041.0, + "completions/max_terminated_length": 793.0, + "completions/mean_length": 587.90625, + "completions/mean_terminated_length": 580.7142944335938, + "completions/min_length": 417.0, + "completions/min_terminated_length": 417.0, + "epoch": 0.963141993957704, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03841537609696388, + "learning_rate": 3.1034966085044613e-07, + "loss": 0.0091, + "num_tokens": 269812813.0, + "reward": 4.558248519897461, + "reward_std": 1.7040300369262695, + "rewards/accuracy_reward/mean": 3.819967269897461, + "rewards/accuracy_reward/std": 3.7653045654296875, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 1594 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 864.0, + "completions/max_terminated_length": 864.0, + "completions/mean_length": 599.921875, + "completions/mean_terminated_length": 599.921875, + "completions/min_length": 343.0, + "completions/min_terminated_length": 343.0, + "epoch": 0.9637462235649547, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.037346597760915756, + "learning_rate": 3.100189034837199e-07, + "loss": 0.0095, + "num_tokens": 270008104.0, + "reward": 2.8910155296325684, + "reward_std": 1.262922763824463, + "rewards/accuracy_reward/mean": 2.1410155296325684, + "rewards/accuracy_reward/std": 3.2805261611938477, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1595 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 989.0, + "completions/max_terminated_length": 989.0, + "completions/mean_length": 599.828125, + "completions/mean_terminated_length": 599.828125, + "completions/min_length": 369.0, + "completions/min_terminated_length": 369.0, + "epoch": 0.9643504531722055, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.0510229617357254, + "learning_rate": 3.0969349783460157e-07, + "loss": -0.0042, + "num_tokens": 270217821.0, + "reward": 4.703251361846924, + "reward_std": 2.377859592437744, + "rewards/accuracy_reward/mean": 3.953251361846924, + "rewards/accuracy_reward/std": 3.762789011001587, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1596 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1218.0, + "completions/max_terminated_length": 1218.0, + "completions/mean_length": 548.125, + "completions/mean_terminated_length": 548.125, + "completions/min_length": 330.0, + "completions/min_terminated_length": 330.0, + "epoch": 0.9649546827794562, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.029783250764012337, + "learning_rate": 3.093734452027213e-07, + "loss": 0.0039, + "num_tokens": 270385557.0, + "reward": 4.099041938781738, + "reward_std": 1.2282915115356445, + "rewards/accuracy_reward/mean": 3.3490421772003174, + "rewards/accuracy_reward/std": 3.7756049633026123, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1597 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1056.0, + "completions/max_terminated_length": 1056.0, + "completions/mean_length": 687.828125, + "completions/mean_terminated_length": 687.828125, + "completions/min_length": 442.0, + "completions/min_terminated_length": 442.0, + "epoch": 0.965558912386707, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.04966485872864723, + "learning_rate": 3.090587468663292e-07, + "loss": -0.0062, + "num_tokens": 270624218.0, + "reward": 1.800739049911499, + "reward_std": 1.6842288970947266, + "rewards/accuracy_reward/mean": 1.050739049911499, + "rewards/accuracy_reward/std": 2.6043312549591064, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1598 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 744.0, + "completions/max_terminated_length": 744.0, + "completions/mean_length": 497.046875, + "completions/mean_terminated_length": 497.046875, + "completions/min_length": 283.0, + "completions/min_terminated_length": 283.0, + "epoch": 0.9661631419939577, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03489319607615471, + "learning_rate": 3.087494040822913e-07, + "loss": -0.0317, + "num_tokens": 270789309.0, + "reward": 5.595579624176025, + "reward_std": 1.4162607192993164, + "rewards/accuracy_reward/mean": 4.845579624176025, + "rewards/accuracy_reward/std": 3.5701370239257812, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1599 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 787.0, + "completions/max_terminated_length": 787.0, + "completions/mean_length": 473.140625, + "completions/mean_terminated_length": 473.140625, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "epoch": 0.9667673716012085, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.021849358454346657, + "learning_rate": 3.084454180860842e-07, + "loss": -0.0104, + "num_tokens": 270988070.0, + "reward": 4.088839054107666, + "reward_std": 0.7576325535774231, + "rewards/accuracy_reward/mean": 3.338839054107666, + "rewards/accuracy_reward/std": 3.6972694396972656, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1600 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1114.0, + "completions/max_terminated_length": 1114.0, + "completions/mean_length": 654.03125, + "completions/mean_terminated_length": 654.03125, + "completions/min_length": 369.0, + "completions/min_terminated_length": 369.0, + "epoch": 0.9673716012084592, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.05162634328007698, + "learning_rate": 3.081467900917899e-07, + "loss": 0.0226, + "num_tokens": 271137928.0, + "reward": 4.327084064483643, + "reward_std": 2.1479854583740234, + "rewards/accuracy_reward/mean": 3.5770843029022217, + "rewards/accuracy_reward/std": 3.690094232559204, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1601 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 869.0, + "completions/max_terminated_length": 869.0, + "completions/mean_length": 632.59375, + "completions/mean_terminated_length": 632.59375, + "completions/min_length": 437.0, + "completions/min_terminated_length": 437.0, + "epoch": 0.9679758308157099, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.052328094840049744, + "learning_rate": 3.078535212920916e-07, + "loss": 0.0014, + "num_tokens": 271331982.0, + "reward": 3.236067295074463, + "reward_std": 2.383765697479248, + "rewards/accuracy_reward/mean": 2.486067056655884, + "rewards/accuracy_reward/std": 3.511209487915039, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1602 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1261.0, + "completions/max_terminated_length": 1261.0, + "completions/mean_length": 707.046875, + "completions/mean_terminated_length": 707.046875, + "completions/min_length": 340.0, + "completions/min_terminated_length": 340.0, + "epoch": 0.9685800604229607, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.01500303577631712, + "learning_rate": 3.0756561285826816e-07, + "loss": -0.0079, + "num_tokens": 271504849.0, + "reward": 4.452210903167725, + "reward_std": 0.5281606316566467, + "rewards/accuracy_reward/mean": 3.7022109031677246, + "rewards/accuracy_reward/std": 3.6719717979431152, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1603 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 789.0, + "completions/max_terminated_length": 789.0, + "completions/mean_length": 496.171875, + "completions/mean_terminated_length": 496.171875, + "completions/min_length": 348.0, + "completions/min_terminated_length": 348.0, + "epoch": 0.9691842900302114, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.010892923921346664, + "learning_rate": 3.072830659401903e-07, + "loss": -0.005, + "num_tokens": 271715052.0, + "reward": 6.208377838134766, + "reward_std": 0.5148655772209167, + "rewards/accuracy_reward/mean": 5.458378314971924, + "rewards/accuracy_reward/std": 3.331895589828491, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1604 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 832.0, + "completions/max_terminated_length": 832.0, + "completions/mean_length": 545.28125, + "completions/mean_terminated_length": 545.28125, + "completions/min_length": 358.0, + "completions/min_terminated_length": 358.0, + "epoch": 0.9697885196374623, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03507222607731819, + "learning_rate": 3.0700588166631506e-07, + "loss": 0.0118, + "num_tokens": 271886734.0, + "reward": 5.772264003753662, + "reward_std": 0.8876798152923584, + "rewards/accuracy_reward/mean": 5.022264003753662, + "rewards/accuracy_reward/std": 3.509002923965454, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1605 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 946.0, + "completions/max_terminated_length": 946.0, + "completions/mean_length": 581.671875, + "completions/mean_terminated_length": 581.671875, + "completions/min_length": 289.0, + "completions/min_terminated_length": 289.0, + "epoch": 0.970392749244713, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.042440757155418396, + "learning_rate": 3.0673406114368184e-07, + "loss": -0.0048, + "num_tokens": 272047321.0, + "reward": 4.04852819442749, + "reward_std": 1.8281769752502441, + "rewards/accuracy_reward/mean": 3.2985281944274902, + "rewards/accuracy_reward/std": 3.6795897483825684, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1606 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1115.0, + "completions/max_terminated_length": 1115.0, + "completions/mean_length": 644.109375, + "completions/mean_terminated_length": 644.109375, + "completions/min_length": 361.0, + "completions/min_terminated_length": 361.0, + "epoch": 0.9709969788519638, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.004062322899699211, + "learning_rate": 3.06467605457908e-07, + "loss": -0.001, + "num_tokens": 272250688.0, + "reward": 2.595961093902588, + "reward_std": 0.16064950823783875, + "rewards/accuracy_reward/mean": 1.8459609746932983, + "rewards/accuracy_reward/std": 3.282470703125, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1607 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1003.0, + "completions/max_terminated_length": 1003.0, + "completions/mean_length": 631.703125, + "completions/mean_terminated_length": 631.703125, + "completions/min_length": 295.0, + "completions/min_terminated_length": 295.0, + "epoch": 0.9716012084592145, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.043827250599861145, + "learning_rate": 3.0620651567318436e-07, + "loss": 0.0085, + "num_tokens": 272433021.0, + "reward": 6.3395843505859375, + "reward_std": 1.5198171138763428, + "rewards/accuracy_reward/mean": 5.5895843505859375, + "rewards/accuracy_reward/std": 3.2527709007263184, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1608 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 865.0, + "completions/max_terminated_length": 865.0, + "completions/mean_length": 505.5, + "completions/mean_terminated_length": 505.5, + "completions/min_length": 304.0, + "completions/min_terminated_length": 304.0, + "epoch": 0.9722054380664653, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.015553508885204792, + "learning_rate": 3.0595079283227115e-07, + "loss": -0.0004, + "num_tokens": 272576205.0, + "reward": 4.373162269592285, + "reward_std": 0.5114259123802185, + "rewards/accuracy_reward/mean": 3.623161792755127, + "rewards/accuracy_reward/std": 3.739793062210083, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1609 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 953.0, + "completions/max_terminated_length": 953.0, + "completions/mean_length": 504.609375, + "completions/mean_terminated_length": 504.609375, + "completions/min_length": 284.0, + "completions/min_terminated_length": 284.0, + "epoch": 0.972809667673716, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0003819653647951782, + "learning_rate": 3.0570043795649326e-07, + "loss": -0.0003, + "num_tokens": 272760180.0, + "reward": 4.46827507019043, + "reward_std": 0.01758105307817459, + "rewards/accuracy_reward/mean": 3.7182750701904297, + "rewards/accuracy_reward/std": 3.7477633953094482, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1610 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1096.0, + "completions/max_terminated_length": 1096.0, + "completions/mean_length": 551.8125, + "completions/mean_terminated_length": 551.8125, + "completions/min_length": 351.0, + "completions/min_terminated_length": 351.0, + "epoch": 0.9734138972809667, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.00043267227010801435, + "learning_rate": 3.0545545204573714e-07, + "loss": -0.0004, + "num_tokens": 272948104.0, + "reward": 6.34544038772583, + "reward_std": 0.02393009513616562, + "rewards/accuracy_reward/mean": 5.595440864562988, + "rewards/accuracy_reward/std": 3.256223440170288, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1611 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 826.0, + "completions/max_terminated_length": 826.0, + "completions/mean_length": 442.3125, + "completions/mean_terminated_length": 442.3125, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "epoch": 0.9740181268882175, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.053487204015254974, + "learning_rate": 3.05215836078446e-07, + "loss": 0.0099, + "num_tokens": 273099052.0, + "reward": 5.4932732582092285, + "reward_std": 2.9659130573272705, + "rewards/accuracy_reward/mean": 4.747179985046387, + "rewards/accuracy_reward/std": 3.6186952590942383, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 1612 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 944.0, + "completions/max_terminated_length": 944.0, + "completions/mean_length": 567.109375, + "completions/mean_terminated_length": 567.109375, + "completions/min_length": 290.0, + "completions/min_terminated_length": 290.0, + "epoch": 0.9746223564954682, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03990940377116203, + "learning_rate": 3.04981591011616e-07, + "loss": 0.0042, + "num_tokens": 273288819.0, + "reward": 4.961240768432617, + "reward_std": 1.5610487461090088, + "rewards/accuracy_reward/mean": 4.211240768432617, + "rewards/accuracy_reward/std": 3.728358745574951, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1613 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 951.0, + "completions/max_terminated_length": 951.0, + "completions/mean_length": 565.015625, + "completions/mean_terminated_length": 565.015625, + "completions/min_length": 310.0, + "completions/min_terminated_length": 310.0, + "epoch": 0.9752265861027191, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04232166334986687, + "learning_rate": 3.047527177807929e-07, + "loss": -0.0095, + "num_tokens": 273427748.0, + "reward": 7.15316104888916, + "reward_std": 1.4412761926651, + "rewards/accuracy_reward/mean": 6.40316104888916, + "rewards/accuracy_reward/std": 2.6108193397521973, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1614 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 785.0, + "completions/max_terminated_length": 785.0, + "completions/mean_length": 476.25, + "completions/mean_terminated_length": 476.25, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 0.9758308157099698, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.013810059055685997, + "learning_rate": 3.045292173000678e-07, + "loss": -0.0049, + "num_tokens": 273567524.0, + "reward": 2.715712547302246, + "reward_std": 0.480778306722641, + "rewards/accuracy_reward/mean": 1.965712547302246, + "rewards/accuracy_reward/std": 3.3323447704315186, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1615 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1346.0, + "completions/mean_length": 867.859375, + "completions/mean_terminated_length": 789.183349609375, + "completions/min_length": 558.0, + "completions/min_terminated_length": 558.0, + "epoch": 0.9764350453172206, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04717731475830078, + "learning_rate": 3.0431109046207366e-07, + "loss": -0.0228, + "num_tokens": 273725867.0, + "reward": 1.2942702770233154, + "reward_std": 1.9348382949829102, + "rewards/accuracy_reward/mean": 0.5911452770233154, + "rewards/accuracy_reward/std": 2.2270898818969727, + "rewards/tag_count_reward/mean": 0.703125, + "rewards/tag_count_reward/std": 0.18298126757144928, + "step": 1616 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 746.0, + "completions/max_terminated_length": 746.0, + "completions/mean_length": 475.546875, + "completions/mean_terminated_length": 475.546875, + "completions/min_length": 293.0, + "completions/min_terminated_length": 293.0, + "epoch": 0.9770392749244713, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0572805181145668, + "learning_rate": 3.0409833813798234e-07, + "loss": -0.0008, + "num_tokens": 273954798.0, + "reward": 5.404472351074219, + "reward_std": 2.531865119934082, + "rewards/accuracy_reward/mean": 4.654472351074219, + "rewards/accuracy_reward/std": 3.5854618549346924, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1617 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 761.0, + "completions/max_terminated_length": 761.0, + "completions/mean_length": 504.15625, + "completions/mean_terminated_length": 504.15625, + "completions/min_length": 335.0, + "completions/min_terminated_length": 335.0, + "epoch": 0.9776435045317221, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03528394550085068, + "learning_rate": 3.0389096117749956e-07, + "loss": -0.0139, + "num_tokens": 274099832.0, + "reward": 7.453335762023926, + "reward_std": 1.8961772918701172, + "rewards/accuracy_reward/mean": 6.703335762023926, + "rewards/accuracy_reward/std": 2.2309465408325195, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1618 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 775.0, + "completions/max_terminated_length": 775.0, + "completions/mean_length": 546.8125, + "completions/mean_terminated_length": 546.8125, + "completions/min_length": 305.0, + "completions/min_terminated_length": 305.0, + "epoch": 0.9782477341389728, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05206969752907753, + "learning_rate": 3.0368896040886336e-07, + "loss": -0.0017, + "num_tokens": 274313580.0, + "reward": 4.828471660614014, + "reward_std": 2.273467779159546, + "rewards/accuracy_reward/mean": 4.078472137451172, + "rewards/accuracy_reward/std": 3.792839288711548, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1619 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 875.0, + "completions/max_terminated_length": 875.0, + "completions/mean_length": 562.234375, + "completions/mean_terminated_length": 562.234375, + "completions/min_length": 288.0, + "completions/min_terminated_length": 288.0, + "epoch": 0.9788519637462235, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.020315414294600487, + "learning_rate": 3.0349233663883985e-07, + "loss": 0.0007, + "num_tokens": 274486859.0, + "reward": 2.397218704223633, + "reward_std": 0.5973656177520752, + "rewards/accuracy_reward/mean": 1.6472187042236328, + "rewards/accuracy_reward/std": 3.1029999256134033, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1620 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1709.0, + "completions/max_terminated_length": 1709.0, + "completions/mean_length": 577.4375, + "completions/mean_terminated_length": 577.4375, + "completions/min_length": 314.0, + "completions/min_terminated_length": 314.0, + "epoch": 0.9794561933534743, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04160207882523537, + "learning_rate": 3.033010906527195e-07, + "loss": 0.0325, + "num_tokens": 274627511.0, + "reward": 5.233031272888184, + "reward_std": 1.438629388809204, + "rewards/accuracy_reward/mean": 4.483031272888184, + "rewards/accuracy_reward/std": 3.6524763107299805, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1621 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 834.0, + "completions/max_terminated_length": 834.0, + "completions/mean_length": 634.515625, + "completions/mean_terminated_length": 634.515625, + "completions/min_length": 407.0, + "completions/min_terminated_length": 407.0, + "epoch": 0.980060422960725, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04395109415054321, + "learning_rate": 3.031152232143153e-07, + "loss": -0.0165, + "num_tokens": 274826520.0, + "reward": 4.062590599060059, + "reward_std": 1.7938520908355713, + "rewards/accuracy_reward/mean": 3.3164968490600586, + "rewards/accuracy_reward/std": 3.799056053161621, + "rewards/tag_count_reward/mean": 0.74609375, + "rewards/tag_count_reward/std": 0.03125, + "step": 1622 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 981.0, + "completions/max_terminated_length": 981.0, + "completions/mean_length": 598.578125, + "completions/mean_terminated_length": 598.578125, + "completions/min_length": 314.0, + "completions/min_terminated_length": 314.0, + "epoch": 0.9806646525679759, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.018429970368742943, + "learning_rate": 3.0293473506595824e-07, + "loss": -0.0022, + "num_tokens": 274989261.0, + "reward": 6.1684465408325195, + "reward_std": 0.5345081090927124, + "rewards/accuracy_reward/mean": 5.418447017669678, + "rewards/accuracy_reward/std": 3.3230600357055664, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1623 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1011.0, + "completions/max_terminated_length": 1011.0, + "completions/mean_length": 605.25, + "completions/mean_terminated_length": 605.25, + "completions/min_length": 424.0, + "completions/min_terminated_length": 424.0, + "epoch": 0.9812688821752266, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.053192444145679474, + "learning_rate": 3.0275962692849593e-07, + "loss": -0.0125, + "num_tokens": 275168173.0, + "reward": 5.279561996459961, + "reward_std": 2.3665473461151123, + "rewards/accuracy_reward/mean": 4.529562473297119, + "rewards/accuracy_reward/std": 3.6766843795776367, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1624 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 969.0, + "completions/max_terminated_length": 969.0, + "completions/mean_length": 557.9375, + "completions/mean_terminated_length": 557.9375, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "epoch": 0.9818731117824774, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.039757516235113144, + "learning_rate": 3.025898995012881e-07, + "loss": 0.0007, + "num_tokens": 275320777.0, + "reward": 3.4244375228881836, + "reward_std": 0.9636476635932922, + "rewards/accuracy_reward/mean": 2.6744375228881836, + "rewards/accuracy_reward/std": 3.5990397930145264, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1625 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1167.0, + "completions/max_terminated_length": 1167.0, + "completions/mean_length": 626.296875, + "completions/mean_terminated_length": 626.296875, + "completions/min_length": 348.0, + "completions/min_terminated_length": 348.0, + "epoch": 0.9824773413897281, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.03774777054786682, + "learning_rate": 3.024255534622053e-07, + "loss": 0.0176, + "num_tokens": 275476940.0, + "reward": 4.1175408363342285, + "reward_std": 1.1286678314208984, + "rewards/accuracy_reward/mean": 3.3675405979156494, + "rewards/accuracy_reward/std": 3.7485992908477783, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1626 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 838.0, + "completions/max_terminated_length": 838.0, + "completions/mean_length": 506.015625, + "completions/mean_terminated_length": 506.015625, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "epoch": 0.9830815709969789, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03559008240699768, + "learning_rate": 3.022665894676248e-07, + "loss": 0.0049, + "num_tokens": 275606637.0, + "reward": 5.598783016204834, + "reward_std": 0.9304074645042419, + "rewards/accuracy_reward/mean": 4.848782539367676, + "rewards/accuracy_reward/std": 3.537518262863159, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1627 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 628.0, + "completions/max_terminated_length": 628.0, + "completions/mean_length": 461.8125, + "completions/mean_terminated_length": 461.8125, + "completions/min_length": 288.0, + "completions/min_terminated_length": 288.0, + "epoch": 0.9836858006042296, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0378757081925869, + "learning_rate": 3.0211300815242925e-07, + "loss": 0.0015, + "num_tokens": 275765921.0, + "reward": 6.809762477874756, + "reward_std": 2.0407352447509766, + "rewards/accuracy_reward/mean": 6.059762477874756, + "rewards/accuracy_reward/std": 2.8317224979400635, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1628 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 759.0, + "completions/max_terminated_length": 759.0, + "completions/mean_length": 548.75, + "completions/mean_terminated_length": 548.75, + "completions/min_length": 317.0, + "completions/min_terminated_length": 317.0, + "epoch": 0.9842900302114803, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.024274898692965508, + "learning_rate": 3.019648101300034e-07, + "loss": -0.0208, + "num_tokens": 275926337.0, + "reward": 5.650043487548828, + "reward_std": 0.9427006840705872, + "rewards/accuracy_reward/mean": 4.900043964385986, + "rewards/accuracy_reward/std": 3.558845281600952, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1629 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1356.0, + "completions/max_terminated_length": 1356.0, + "completions/mean_length": 613.4375, + "completions/mean_terminated_length": 613.4375, + "completions/min_length": 337.0, + "completions/min_terminated_length": 337.0, + "epoch": 0.9848942598187311, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04356250539422035, + "learning_rate": 3.018219959922312e-07, + "loss": 0.0049, + "num_tokens": 276079277.0, + "reward": 3.0614562034606934, + "reward_std": 1.857572078704834, + "rewards/accuracy_reward/mean": 2.3114562034606934, + "rewards/accuracy_reward/std": 3.49177885055542, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1630 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 668.0, + "completions/max_terminated_length": 668.0, + "completions/mean_length": 497.90625, + "completions/mean_terminated_length": 497.90625, + "completions/min_length": 269.0, + "completions/min_terminated_length": 269.0, + "epoch": 0.9854984894259818, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.005168252624571323, + "learning_rate": 3.0168456630949496e-07, + "loss": -0.0, + "num_tokens": 276237303.0, + "reward": 4.2811079025268555, + "reward_std": 0.2222902774810791, + "rewards/accuracy_reward/mean": 3.5311081409454346, + "rewards/accuracy_reward/std": 3.9206018447875977, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1631 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 972.0, + "completions/max_terminated_length": 972.0, + "completions/mean_length": 497.125, + "completions/mean_terminated_length": 497.125, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 0.9861027190332327, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.013100476935505867, + "learning_rate": 3.015525216306716e-07, + "loss": -0.0042, + "num_tokens": 276374479.0, + "reward": 6.23393440246582, + "reward_std": 0.5377016067504883, + "rewards/accuracy_reward/mean": 5.48393440246582, + "rewards/accuracy_reward/std": 3.115086555480957, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1632 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 716.0, + "completions/max_terminated_length": 716.0, + "completions/mean_length": 524.140625, + "completions/mean_terminated_length": 524.140625, + "completions/min_length": 347.0, + "completions/min_terminated_length": 347.0, + "epoch": 0.9867069486404834, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0394965223968029, + "learning_rate": 3.0142586248313107e-07, + "loss": 0.003, + "num_tokens": 276527864.0, + "reward": 6.114754676818848, + "reward_std": 2.4936323165893555, + "rewards/accuracy_reward/mean": 5.364754676818848, + "rewards/accuracy_reward/std": 3.3825266361236572, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1633 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1968.0, + "completions/max_terminated_length": 1968.0, + "completions/mean_length": 782.234375, + "completions/mean_terminated_length": 782.234375, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.9873111782477342, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.048109784722328186, + "learning_rate": 3.0130458937273436e-07, + "loss": 0.0264, + "num_tokens": 276699815.0, + "reward": 1.671739101409912, + "reward_std": 1.7840732336044312, + "rewards/accuracy_reward/mean": 0.9217391014099121, + "rewards/accuracy_reward/std": 2.307060956954956, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1634 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 813.0, + "completions/max_terminated_length": 813.0, + "completions/mean_length": 552.5625, + "completions/mean_terminated_length": 552.5625, + "completions/min_length": 314.0, + "completions/min_terminated_length": 314.0, + "epoch": 0.9879154078549849, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0495900958776474, + "learning_rate": 3.011887027838309e-07, + "loss": 0.0028, + "num_tokens": 276895419.0, + "reward": 7.329078197479248, + "reward_std": 2.17859148979187, + "rewards/accuracy_reward/mean": 6.579078197479248, + "rewards/accuracy_reward/std": 2.53115177154541, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1635 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 980.0, + "completions/max_terminated_length": 980.0, + "completions/mean_length": 583.375, + "completions/mean_terminated_length": 583.375, + "completions/min_length": 326.0, + "completions/min_terminated_length": 326.0, + "epoch": 0.9885196374622357, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.013938535004854202, + "learning_rate": 3.0107820317925757e-07, + "loss": 0.0046, + "num_tokens": 277019379.0, + "reward": 6.191319942474365, + "reward_std": 0.47322311997413635, + "rewards/accuracy_reward/mean": 5.441319465637207, + "rewards/accuracy_reward/std": 3.2992260456085205, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1636 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 721.0, + "completions/max_terminated_length": 721.0, + "completions/mean_length": 494.59375, + "completions/mean_terminated_length": 494.59375, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.9891238670694864, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.02087569795548916, + "learning_rate": 3.00973091000336e-07, + "loss": -0.0012, + "num_tokens": 277243913.0, + "reward": 5.970442295074463, + "reward_std": 0.7569543123245239, + "rewards/accuracy_reward/mean": 5.220442295074463, + "rewards/accuracy_reward/std": 3.4192967414855957, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1637 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 858.0, + "completions/max_terminated_length": 858.0, + "completions/mean_length": 539.1875, + "completions/mean_terminated_length": 539.1875, + "completions/min_length": 362.0, + "completions/min_terminated_length": 362.0, + "epoch": 0.9897280966767371, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.01654951460659504, + "learning_rate": 3.0087336666687105e-07, + "loss": -0.0014, + "num_tokens": 277442757.0, + "reward": 4.332117080688477, + "reward_std": 0.5315866470336914, + "rewards/accuracy_reward/mean": 3.5821170806884766, + "rewards/accuracy_reward/std": 3.7588038444519043, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1638 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1403.0, + "completions/max_terminated_length": 1403.0, + "completions/mean_length": 570.828125, + "completions/mean_terminated_length": 570.828125, + "completions/min_length": 331.0, + "completions/min_terminated_length": 331.0, + "epoch": 0.9903323262839879, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.02204856649041176, + "learning_rate": 3.007790305771493e-07, + "loss": -0.0074, + "num_tokens": 277643882.0, + "reward": 4.548664093017578, + "reward_std": 0.9771441221237183, + "rewards/accuracy_reward/mean": 3.798664093017578, + "rewards/accuracy_reward/std": 3.664637804031372, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1639 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 704.0, + "completions/max_terminated_length": 704.0, + "completions/mean_length": 434.140625, + "completions/mean_terminated_length": 434.140625, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 0.9909365558912386, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.00021549616940319538, + "learning_rate": 3.0069008310793726e-07, + "loss": -0.0001, + "num_tokens": 277845699.0, + "reward": 2.5911452770233154, + "reward_std": 0.007079769391566515, + "rewards/accuracy_reward/mean": 1.8411452770233154, + "rewards/accuracy_reward/std": 3.2141966819763184, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1640 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 758.0, + "completions/max_terminated_length": 758.0, + "completions/mean_length": 506.15625, + "completions/mean_terminated_length": 506.15625, + "completions/min_length": 357.0, + "completions/min_terminated_length": 357.0, + "epoch": 0.9915407854984895, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.055746935307979584, + "learning_rate": 3.0060652461448024e-07, + "loss": 0.0427, + "num_tokens": 277956765.0, + "reward": 6.634885311126709, + "reward_std": 2.8072409629821777, + "rewards/accuracy_reward/mean": 5.884885787963867, + "rewards/accuracy_reward/std": 3.0201563835144043, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1641 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 703.0, + "completions/max_terminated_length": 703.0, + "completions/mean_length": 513.46875, + "completions/mean_terminated_length": 513.46875, + "completions/min_length": 313.0, + "completions/min_terminated_length": 313.0, + "epoch": 0.9921450151057402, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0017342359060421586, + "learning_rate": 3.0052835543050023e-07, + "loss": -0.0005, + "num_tokens": 278123275.0, + "reward": 6.405112266540527, + "reward_std": 0.06731852889060974, + "rewards/accuracy_reward/mean": 5.6551127433776855, + "rewards/accuracy_reward/std": 3.1039798259735107, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1642 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1041.0, + "completions/max_terminated_length": 1041.0, + "completions/mean_length": 622.078125, + "completions/mean_terminated_length": 622.078125, + "completions/min_length": 289.0, + "completions/min_terminated_length": 289.0, + "epoch": 0.992749244712991, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03694729506969452, + "learning_rate": 3.0045557586819545e-07, + "loss": -0.0207, + "num_tokens": 278305152.0, + "reward": 3.54154372215271, + "reward_std": 1.5797648429870605, + "rewards/accuracy_reward/mean": 2.79154372215271, + "rewards/accuracy_reward/std": 3.517258882522583, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1643 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 608.0, + "completions/max_terminated_length": 608.0, + "completions/mean_length": 442.796875, + "completions/mean_terminated_length": 442.796875, + "completions/min_length": 291.0, + "completions/min_terminated_length": 291.0, + "epoch": 0.9933534743202417, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03596871718764305, + "learning_rate": 3.003881862182383e-07, + "loss": 0.0035, + "num_tokens": 278458307.0, + "reward": 4.348369598388672, + "reward_std": 1.7489923238754272, + "rewards/accuracy_reward/mean": 3.598369598388672, + "rewards/accuracy_reward/std": 3.7423391342163086, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1644 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 766.0, + "completions/max_terminated_length": 766.0, + "completions/mean_length": 524.546875, + "completions/mean_terminated_length": 524.546875, + "completions/min_length": 302.0, + "completions/min_terminated_length": 302.0, + "epoch": 0.9939577039274925, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.032491933554410934, + "learning_rate": 3.0032618674977464e-07, + "loss": 0.0079, + "num_tokens": 278606022.0, + "reward": 6.910118579864502, + "reward_std": 0.9148675799369812, + "rewards/accuracy_reward/mean": 6.16011905670166, + "rewards/accuracy_reward/std": 2.8287153244018555, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1645 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1122.0, + "completions/mean_length": 554.21875, + "completions/mean_terminated_length": 530.5079956054688, + "completions/min_length": 316.0, + "completions/min_terminated_length": 316.0, + "epoch": 0.9945619335347432, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.024080799892544746, + "learning_rate": 3.002695777104225e-07, + "loss": -0.0356, + "num_tokens": 278759588.0, + "reward": 4.100481033325195, + "reward_std": 1.2019246816635132, + "rewards/accuracy_reward/mean": 3.3622002601623535, + "rewards/accuracy_reward/std": 3.7459347248077393, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 1646 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1743.0, + "completions/max_terminated_length": 1743.0, + "completions/mean_length": 535.0625, + "completions/mean_terminated_length": 535.0625, + "completions/min_length": 302.0, + "completions/min_terminated_length": 302.0, + "epoch": 0.9951661631419939, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.05346053093671799, + "learning_rate": 3.002183593262716e-07, + "loss": 0.0241, + "num_tokens": 278912696.0, + "reward": 3.8614139556884766, + "reward_std": 1.8585524559020996, + "rewards/accuracy_reward/mean": 3.1114139556884766, + "rewards/accuracy_reward/std": 3.738966703414917, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1647 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 782.0, + "completions/max_terminated_length": 782.0, + "completions/mean_length": 505.921875, + "completions/mean_terminated_length": 505.921875, + "completions/min_length": 320.0, + "completions/min_terminated_length": 320.0, + "epoch": 0.9957703927492447, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.026339000090956688, + "learning_rate": 3.0017253180188163e-07, + "loss": -0.0015, + "num_tokens": 279065315.0, + "reward": 1.067490577697754, + "reward_std": 1.1695621013641357, + "rewards/accuracy_reward/mean": 0.3174906373023987, + "rewards/accuracy_reward/std": 1.6016401052474976, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1648 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1337.0, + "completions/mean_length": 569.25, + "completions/mean_terminated_length": 545.77783203125, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 0.9963746223564954, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.019240394234657288, + "learning_rate": 3.00132095320282e-07, + "loss": 0.0056, + "num_tokens": 279193747.0, + "reward": 4.290842056274414, + "reward_std": 0.786018431186676, + "rewards/accuracy_reward/mean": 3.556467056274414, + "rewards/accuracy_reward/std": 3.662733554840088, + "rewards/tag_count_reward/mean": 0.734375, + "rewards/tag_count_reward/std": 0.09834947437047958, + "step": 1649 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1015.0, + "completions/max_terminated_length": 1015.0, + "completions/mean_length": 574.15625, + "completions/mean_terminated_length": 574.15625, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, + "epoch": 0.9969788519637462, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04065730795264244, + "learning_rate": 3.000970500429711e-07, + "loss": 0.0044, + "num_tokens": 279337661.0, + "reward": 7.575520038604736, + "reward_std": 1.7820308208465576, + "rewards/accuracy_reward/mean": 6.8255205154418945, + "rewards/accuracy_reward/std": 2.064950704574585, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1650 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 828.0, + "completions/max_terminated_length": 828.0, + "completions/mean_length": 555.9375, + "completions/mean_terminated_length": 555.9375, + "completions/min_length": 295.0, + "completions/min_terminated_length": 295.0, + "epoch": 0.997583081570997, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.04509393498301506, + "learning_rate": 3.000673961099151e-07, + "loss": -0.0055, + "num_tokens": 279510169.0, + "reward": 3.579514265060425, + "reward_std": 1.5520298480987549, + "rewards/accuracy_reward/mean": 2.8295140266418457, + "rewards/accuracy_reward/std": 3.6128878593444824, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1651 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1243.0, + "completions/max_terminated_length": 1243.0, + "completions/mean_length": 593.21875, + "completions/mean_terminated_length": 593.21875, + "completions/min_length": 339.0, + "completions/min_terminated_length": 339.0, + "epoch": 0.9981873111782478, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.044132333248853683, + "learning_rate": 3.0004313363954854e-07, + "loss": 0.0079, + "num_tokens": 279671239.0, + "reward": 5.281437397003174, + "reward_std": 2.7527523040771484, + "rewards/accuracy_reward/mean": 4.531437397003174, + "rewards/accuracy_reward/std": 3.5386111736297607, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1652 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 810.0, + "completions/max_terminated_length": 810.0, + "completions/mean_length": 546.5, + "completions/mean_terminated_length": 546.5, + "completions/min_length": 288.0, + "completions/min_terminated_length": 288.0, + "epoch": 0.9987915407854985, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.043051205575466156, + "learning_rate": 3.000242627287724e-07, + "loss": 0.0156, + "num_tokens": 279850487.0, + "reward": 4.079076290130615, + "reward_std": 1.6832554340362549, + "rewards/accuracy_reward/mean": 3.3290762901306152, + "rewards/accuracy_reward/std": 3.6864542961120605, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1653 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 942.0, + "completions/mean_length": 592.828125, + "completions/mean_terminated_length": 569.7301635742188, + "completions/min_length": 326.0, + "completions/min_terminated_length": 326.0, + "epoch": 0.9993957703927493, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05087336152791977, + "learning_rate": 3.0001078345295487e-07, + "loss": -0.0094, + "num_tokens": 280022940.0, + "reward": 6.760767459869385, + "reward_std": 2.507960557937622, + "rewards/accuracy_reward/mean": 6.022485733032227, + "rewards/accuracy_reward/std": 3.001232624053955, + "rewards/tag_count_reward/mean": 0.73828125, + "rewards/tag_count_reward/std": 0.0937500074505806, + "step": 1654 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 995.0, + "completions/max_terminated_length": 995.0, + "completions/mean_length": 684.625, + "completions/mean_terminated_length": 684.625, + "completions/min_length": 471.0, + "completions/min_terminated_length": 471.0, + "epoch": 1.0, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.004666467662900686, + "learning_rate": 3.0000269586593054e-07, + "loss": 0.0001, + "num_tokens": 280189764.0, + "reward": 2.760242223739624, + "reward_std": 0.15598680078983307, + "rewards/accuracy_reward/mean": 2.010242223739624, + "rewards/accuracy_reward/std": 3.1863667964935303, + "rewards/tag_count_reward/mean": 0.75, + "rewards/tag_count_reward/std": 0.0, + "step": 1655 + } + ], + "logging_steps": 1, + "max_steps": 1655, + "num_input_tokens_seen": 280189764, + "num_train_epochs": 1, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}